Beispiel #1
0
    def run(self):
        """Import each queued login's maintainer-side sponsorships into MySQL.

        Work items are dicts with a ``login`` key.  Every file found in
        ``base_path/<login>`` is parsed as JSON and each edge of
        ``data.user.sponsorshipsAsMaintainer.edges`` becomes one row of
        ``github_sponsorships_as_maintainer``, committed row by row.  Edges
        whose ``privacyLevel`` is ``"PRIVATE"`` are stored without a sponsor
        login.

        An error while importing one login is logged and the worker carries
        on with the next item.  The cursor, the connection and the queue's
        unfinished-task counter are always released, so a failure can neither
        leak a connection nor leave ``Queue.join()`` waiting forever.
        """
        try:  # Python 3 module name first, Python 2 fallback
            from queue import Empty
        except ImportError:
            from Queue import Empty

        private_sql = ("insert into github_sponsorships_as_maintainer "
                       "(login, flag, created_at) "
                       "values (%s, %s, %s)")
        public_sql = ("insert into github_sponsorships_as_maintainer "
                      "(login, sponsor_login, flag, created_at) "
                      "values (%s, %s, %s, %s)")

        while True:
            # Checking empty() and then calling get() is racy when several
            # workers share the queue, so fetch without blocking instead.
            try:
                work = self.q.get_nowait()
            except Empty:
                return
            logging.info("the number of work in queue: %s", self.q.qsize())

            try:
                login = work["login"]
                # get db connection
                db = base.connectMysqlDB(config, autocommit=False)
                cur = db.cursor()
                try:
                    # read data from file
                    directory = base_path + "/" + login
                    for file_name in os.listdir(directory):
                        file_path = directory + "/" + file_name
                        text = base.get_info_from_file(file_path)
                        if text is False:
                            logging.warning("file not existed: " + file_path)
                            continue

                        user = json.loads(text)["data"]["user"]
                        logging.info("read file: " + file_path)
                        edges = user["sponsorshipsAsMaintainer"]["edges"]
                        for count, edge in enumerate(edges, 1):
                            node = edge["node"]
                            if node["privacyLevel"] == "PRIVATE":
                                cur.execute(
                                    private_sql,
                                    (user["login"], base.flag2,
                                     base.time_handler(node["createdAt"])))
                            else:
                                sponsor = node["sponsorEntity"]
                                if "company" in sponsor:
                                    flag = base.flag0
                                else:
                                    flag = base.flag1
                                cur.execute(
                                    public_sql,
                                    (user["login"], sponsor["login"], flag,
                                     base.time_handler(node["createdAt"])))
                            db.commit()
                            logging.info("the %sth record in file: %s",
                                         count, file_path)
                except Exception as e:
                    logging.fatal(e)
                finally:
                    cur.close()
                    db.close()
            finally:
                # Always balance the get() so Queue.join() can return.
                self.q.task_done()
Beispiel #2
0
    def run(self):
        """Insert one ``github_user`` row per queued login.

        Work items are dicts with a ``login`` key.  The profile is read from
        ``base_path/<login>.json``; a missing file is only logged.  The
        ``has_sponsors_listing`` column is ``"1"`` when
        ``data.user.hasSponsorsListing`` is ``True`` and ``"0"`` otherwise.

        Errors are not swallowed here: they propagate to the caller, but only
        after the cursor and connection are closed and the item is marked
        done, so ``Queue.join()`` cannot hang on a failed item.
        """
        try:  # Python 3 module name first, Python 2 fallback
            from queue import Empty
        except ImportError:
            from Queue import Empty

        insert_sql = (
            "insert into github_user "
            "(database_id, login, name, email,spon_maintainer_count,"
            " spon_sponsor_count, created_at, updated_at, has_sponsors_listing) "
            "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")

        while True:
            # A blocking get() after an empty() check can wait forever when
            # another worker takes the last item in between, so never block.
            try:
                work = self.q.get_nowait()
            except Empty:
                return
            logging.info("the number of work in queue: %s", self.q.qsize())

            try:
                login = work["login"]
                # get db connection
                db = base.connectMysqlDB(config, autocommit=False)
                cur = db.cursor()
                try:
                    # read data from file
                    file_path = base_path + "/" + login + ".json"
                    text = base.get_info_from_file(file_path)
                    if text is False:
                        logging.warning("file not existed: " + file_path)
                    else:
                        user = json.loads(text)["data"]["user"]
                        logging.info("writing login data: " + login)
                        if user["hasSponsorsListing"] is True:
                            has_listing = "1"
                        else:
                            has_listing = "0"
                        cur.execute(
                            insert_sql,
                            (user["databaseId"],
                             user["login"],
                             user["name"],
                             user["email"],
                             user["sponsorshipsAsMaintainer"]["totalCount"],
                             user["sponsorshipsAsSponsor"]["totalCount"],
                             base.time_handler(user["createdAt"]),
                             base.time_handler(user["updatedAt"]),
                             has_listing))
                        db.commit()
                        logging.info(
                            login +
                            " ~~~~~~~~~ data commit into dababase success!!")
                finally:
                    cur.close()
                    db.close()
            finally:
                # Always balance the get() so Queue.join() can return.
                self.q.task_done()
Beispiel #3
0
    def run(self):
        """Import the sponsor-listing tiers of every queued login into MySQL.

        Work items are dicts with a ``login`` key.  The listing is read from
        ``base_path/<login>.json``; each edge of
        ``data.user.sponsorsListing.tiers.edges`` becomes one row of
        ``github_sponsor_listing_tiers``, committed row by row.  Logins whose
        ``sponsorsListing`` is ``None`` are only logged.

        An error while importing one login is logged and the worker carries
        on with the next item.  The cursor, the connection and the queue's
        unfinished-task counter are always released.
        """
        try:  # Python 3 module name first, Python 2 fallback
            from queue import Empty
        except ImportError:
            from Queue import Empty

        insert_sql = (
            "insert into github_sponsor_listing_tiers "
            "(login, slug, monthly_price_in_cents, monthly_price_in_dollars, name, created_at, updated_at, description) "
            "values (%s, %s, %s, %s, %s, %s, %s, %s)")

        while True:
            # Checking empty() and then calling get() is racy when several
            # workers share the queue, so fetch without blocking instead.
            try:
                work = self.q.get_nowait()
            except Empty:
                return
            logging.info("the number of work in queue: %s", self.q.qsize())

            try:
                login = work["login"]
                # get db connection
                db = base.connectMysqlDB(config, autocommit=False)
                cur = db.cursor()
                try:
                    # read data from file
                    file_path = base_path + "/" + login + ".json"
                    text = base.get_info_from_file(file_path)
                    if text is False:
                        logging.warning("file not existed: " + file_path)
                    else:
                        obj = json.loads(text)
                        user = obj["data"]["user"]
                        listing = user["sponsorsListing"]
                        if listing is None:
                            logging.warning("login: " + login +
                                            " don't have sponsor_listing")
                            logging.warning("sponsor_listing: " + str(obj))
                        else:
                            tiers = listing["tiers"]
                            logging.info(login + " ~~~~~~~~~ has " +
                                         str(tiers["totalCount"]) + " tiers")
                            for edge in tiers["edges"]:
                                node = edge["node"]
                                cur.execute(
                                    insert_sql,
                                    (user["login"],
                                     listing["slug"],
                                     node["monthlyPriceInCents"],
                                     node["monthlyPriceInDollars"],
                                     node["name"],
                                     base.time_handler(node["createdAt"]),
                                     base.time_handler(node["updatedAt"]),
                                     node["description"]))
                                db.commit()
                except Exception as e:
                    logging.fatal(e)
                    logging.error(e)
                finally:
                    cur.close()
                    db.close()
            finally:
                # Always balance the get() so Queue.join() can return.
                self.q.task_done()
Beispiel #4
0
    def run(self):
        """Import the commit comments of every queued login into MySQL.

        Work items are dicts with a ``login`` key.  Each file returned by
        ``base.read_all_filename_in_directory(base_path/<login>)`` is parsed
        as JSON and every edge of ``data.user.commitComments.edges`` becomes
        one row of ``github_commit_comment``, committed row by row.  Files
        without an ``edges`` key are skipped; a comment whose ``commit`` is
        ``None`` is stored with an empty ``commit_oid``.

        The first error is logged and stops this worker.  Even then the
        cursor and connection are closed and the item is marked done, so the
        failure neither leaks a connection nor blocks ``Queue.join()``.
        """
        try:  # Python 3 module name first, Python 2 fallback
            from queue import Empty
        except ImportError:
            from Queue import Empty

        insert_sql = (
            "insert into github_commit_comment "
            "(comm_database_id, login, created_at, updated_at, body, commit_oid) "
            "values (%s, %s, %s, %s, %s, %s)")

        while True:
            # Checking empty() and then calling get() is racy when several
            # workers share the queue, so fetch without blocking instead.
            try:
                work = self.q.get_nowait()
            except Empty:
                return
            logging.info("the number of work in queue: %s", self.q.qsize())

            try:
                login = work["login"]
                # get db connection
                db = base.connectMysqlDB(config, autocommit=False)
                cur = db.cursor()
                try:
                    # read data from file
                    directory = base_path + "/" + login
                    for file_path in base.read_all_filename_in_directory(
                            directory):
                        text = base.get_info_from_file(file_path)
                        if text is False:
                            logging.warning("file not existed: " + file_path)
                            continue

                        user = json.loads(text)["data"]["user"]
                        logging.info("read file: " + file_path)
                        comments = user["commitComments"]
                        if "edges" not in comments:
                            continue
                        for count, edge in enumerate(comments["edges"], 1):
                            logging.info("the %sth record in file: %s",
                                         count, file_path)
                            node = edge["node"]
                            commit = node["commit"]
                            oid = commit["oid"] if commit is not None else ""
                            cur.execute(
                                insert_sql,
                                (node["databaseId"],
                                 user["login"],
                                 base.time_handler(node["createdAt"]),
                                 base.time_handler(node["updatedAt"]),
                                 node["body"], oid))
                            db.commit()
                except Exception as e:
                    logging.fatal(e)
                    return
                finally:
                    cur.close()
                    db.close()
            finally:
                # Always balance the get() so Queue.join() can return.
                self.q.task_done()
Beispiel #5
0
    def run(self):
        """Insert one ``github_sponsor_listing`` row per queued login.

        Work items are dicts with a ``login`` key.  The listing is read from
        ``base_path/<login>.json``.  Logins whose ``sponsorsListing`` is
        ``None`` and logins that already have a row in
        ``github_sponsor_listing`` are only logged; everything else is
        inserted and committed.

        The first error is logged and stops this worker.  Even then the
        cursor and connection are closed and the item is marked done, so the
        failure neither leaks a connection nor blocks ``Queue.join()``.
        """
        try:  # Python 3 module name first, Python 2 fallback
            from queue import Empty
        except ImportError:
            from Queue import Empty

        insert_sql = (
            "insert into github_sponsor_listing "
            "(login, slug, name, tiers_total_count, created_at, short_description) "
            "values (%s, %s, %s, %s, %s, %s)")

        while True:
            # Checking empty() and then calling get() is racy when several
            # workers share the queue, so fetch without blocking instead.
            try:
                work = self.q.get_nowait()
            except Empty:
                return
            logging.info("the number of work in queue: %s", self.q.qsize())

            try:
                login = work["login"]
                # get db connection
                db = base.connectMysqlDB(config, autocommit=False)
                cur = db.cursor()
                try:
                    # read data from file
                    file_path = base_path + "/" + login + ".json"
                    text = base.get_info_from_file(file_path)
                    if text is False:
                        logging.warning("file not existed: " + file_path)
                    else:
                        user = json.loads(text)["data"]["user"]
                        listing = user["sponsorsListing"]
                        if listing is None:
                            logging.info("user: " + login +
                                         " don't create sponsors")
                        else:
                            # Bind the login as a parameter instead of
                            # splicing it into the SQL text.
                            cur.execute(
                                "SELECT * FROM github_sponsor_listing "
                                "WHERE login=%s", (login,))
                            # Any existing row means the login is already
                            # imported, not only exactly one row.
                            if cur.fetchall():
                                logging.info(
                                    "user: " + login +
                                    " had been inserted into database!")
                            else:
                                cur.execute(
                                    insert_sql,
                                    (user["login"],
                                     listing["slug"],
                                     listing["name"],
                                     listing["tiers"]["totalCount"],
                                     base.time_handler(listing["createdAt"]),
                                     listing["shortDescription"]))
                                db.commit()
                                logging.info(
                                    login +
                                    " ~~~~~~~~~ data commit into dababase success!!"
                                )
                except Exception as e:
                    logging.fatal(e)
                    return
                finally:
                    cur.close()
                    db.close()
            finally:
                # Always balance the get() so Queue.join() can return.
                self.q.task_done()
Beispiel #6
0
    def run(self):
        """Import the pull-request reviews of every queued login into MySQL.

        Work items are dicts with a ``login`` key.  Each file returned by
        ``base.read_all_filename_in_directory(base_path/<login>)`` is parsed
        as JSON and every edge of
        ``data.user.contributionsCollection.pullRequestReviewContributions``
        is written to ``github_user_pr_review`` with ``insert ignore``,
        committed row by row.  A failure on a single row is logged and the
        remaining rows are still processed.

        Any other error is logged and stops this worker.  Even then the
        cursor and connection are closed and the item is marked done, so the
        failure neither leaks a connection nor blocks ``Queue.join()``.
        """
        try:  # Python 3 module name first, Python 2 fallback
            from queue import Empty
        except ImportError:
            from Queue import Empty

        insert_sql = ("insert ignore into github_user_pr_review "
                      "(pr_database_id, login, created_at, body) "
                      "values (%s, %s, %s, %s)")

        while True:
            # Checking empty() and then calling get() is racy when several
            # workers share the queue, so fetch without blocking instead.
            try:
                work = self.q.get_nowait()
            except Empty:
                return
            logging.info("the number of work in queue: %s", self.q.qsize())

            try:
                login = work["login"]
                # get db connection
                db = base.connectMysqlDB(config, autocommit=False)
                cur = db.cursor()
                try:
                    # read data from file
                    directory = base_path + "/" + login
                    for file_path in base.read_all_filename_in_directory(
                            directory):
                        text = base.get_info_from_file(file_path)
                        if text is False:
                            logging.warning("file not existed: " + file_path)
                            continue

                        obj = json.loads(text)
                        logging.info("read file: " + file_path)
                        contributions = obj["data"]["user"][
                            "contributionsCollection"][
                                "pullRequestReviewContributions"]
                        for edge in contributions["edges"]:
                            # Field access stays inside the per-row try so a
                            # malformed review only skips that one row.
                            try:
                                review = edge["node"]["pullRequestReview"]
                                cur.execute(
                                    insert_sql,
                                    (review["databaseId"],
                                     review["author"]["login"],
                                     base.time_handler(review["createdAt"]),
                                     review["body"]))
                                db.commit()
                            except Exception as e:
                                logging.error(e)
                except Exception as e:
                    logging.fatal(e)
                    return
                finally:
                    cur.close()
                    db.close()
            finally:
                # Always balance the get() so Queue.join() can return.
                self.q.task_done()
    def run(self):
        """Download and store the reactions of every queued issue.

        Work items are dicts with ``number``, ``owner`` and ``repo`` keys.
        For each item the GitHub ``/issues/<number>/reactions`` endpoint is
        read page by page until an empty page comes back.  Every non-empty
        page is archived through ``base.generate_file`` and each reaction is
        inserted into ``github_reaction`` (committed row by row; a failing
        row is printed and skipped).

        The worker returns once the queue is empty.  The cursor and the
        connection are closed on every exit path, and each fetched item is
        marked done exactly once, whether it finished or was abandoned after
        an unexpected error, so ``Queue.join()`` cannot hang.
        """
        # get db connection
        db = connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        insert_sql = (
            "insert into github_reaction "
            "(id, number, user_login, owner_login, repo, created_at, flag, content) "
            "values (%s, %s, %s, %s, %s, %s, %s, %s)")

        try:
            while True:
                print("")
                try:
                    work = self.q.get(timeout=0)
                except Queue.Empty:
                    return  # queue drained; the finally below closes the db

                try:
                    print("the number of work in queue: " +
                          str(self.q.qsize()))

                    number = work["number"]
                    owner = work["owner"]
                    repo = work["repo"]
                    page = 1
                    total = 0  # reactions fetched so far for this issue

                    # every issue has several pages of reactions
                    while True:
                        # get a suitable token and combine header
                        github_token = get_token()
                        headers = {
                            'User-Agent': 'Mozilla/5.0',
                            'Authorization': 'token ' + github_token,
                            'Content-Type': 'application/json',
                            'method': 'GET',
                            'Accept':
                            'application/vnd.github.squirrel-girl-preview+json'
                        }

                        # combine url
                        url = ("https://api.github.com/repos/" + owner + "/" +
                               repo + "/issues" + "/" + str(number) +
                               "/reactions" + "?page=" + str(page))
                        print("url is: " + url)

                        try:
                            # request data and parse response
                            req = urllib2.Request(url=url, headers=headers)
                            response = urllib2.urlopen(req)
                            try:
                                result = json.loads(
                                    response.read().decode("utf-8"))
                            finally:
                                response.close()

                            if not result:
                                print("finish, comment " + str(number) +
                                      " has reactions: " + str(total))
                                break

                            # write file
                            filename = (base_path + "/" + owner + "&" + repo +
                                        "/" + str(number) + "/" + str(page) +
                                        ".json")
                            status = base.generate_file(filename,
                                                        json.dumps(result))
                            if status is True:
                                print("create file successfully: " + filename)
                            elif status is False:
                                print("file is already existed: " + filename)
                            else:
                                # %-formatting so a non-string status cannot
                                # raise while the failure is being reported.
                                print("create file failed: %s filename: %s" %
                                      (status, filename))
                                continue  # same page is fetched again

                            # Count and advance only once the page has been
                            # accepted, so a retried page is not counted twice.
                            total += len(result)
                            page += 1

                            # handle response json data
                            for record in result:
                                # insert data to database table
                                try:
                                    # "user" may be null; treat it as having
                                    # no login instead of losing the page.
                                    user = record.get("user") or {}
                                    insert_dict = {
                                        "id": record.get("id"),
                                        "user_login": user.get("login"),
                                        "created_at": record.get("created_at"),
                                        "content": record.get("content"),
                                    }
                                    print("insert info: " + str(insert_dict))
                                    cur.execute(
                                        insert_sql,
                                        (insert_dict["id"], number,
                                         insert_dict["user_login"], owner,
                                         repo,
                                         base.time_handler(
                                             insert_dict["created_at"]), 1,
                                         insert_dict["content"]))
                                    db.commit()
                                except Exception as e:
                                    print(str(e))
                        except Exception as e:
                            # NOTE(review): any request error, including a
                            # permanent one such as HTTP 404, retries the same
                            # page with no limit or back-off. Kept as is
                            # because token rotation in get_token() appears to
                            # be the intended recovery -- confirm and bound it.
                            print(str(e) + " error with this page: " + url)
                except Exception as e:
                    # unexpected error, don't interrupt the program
                    print(str(e) + "qiubing")
                finally:
                    # Always balance the get() so Queue.join() can return.
                    self.q.task_done()
        finally:
            cur.close()
            db.close()
Beispiel #8
0
    def run(self):
        """Import each queued login's sponsor-side sponsorships into MySQL.

        Work items are dicts with a ``login`` key.  Every file found in
        ``base_path/<login>`` is parsed as JSON:

        * a file whose ``data.user.sponsorshipsAsSponsor.edges`` is empty
          produces one marker row (login == sponsor_login, flag
          ``base.flag4``);
        * otherwise every non-private edge becomes one row of
          ``github_sponsorships_as_sponsor`` with flag ``"3"``; private edges
          are only logged.

        An error while importing one login is logged and the worker carries
        on with the next item.  The cursor, the connection and the queue's
        unfinished-task counter are always released.
        """
        try:  # Python 3 module name first, Python 2 fallback
            from queue import Empty
        except ImportError:
            from Queue import Empty

        marker_sql = ("insert into github_sponsorships_as_sponsor "
                      "(login, sponsor_login, flag) "
                      "values (%s, %s, %s)")
        insert_sql = ("insert into github_sponsorships_as_sponsor "
                      "(login, slug, sponsor_login, flag, created_at) "
                      "values (%s, %s, %s, %s, %s)")

        while True:
            # Checking empty() and then calling get() is racy when several
            # workers share the queue, so fetch without blocking instead.
            try:
                work = self.q.get_nowait()
            except Empty:
                return
            logging.info("the number of work in queue: %s", self.q.qsize())

            try:
                login = work["login"]
                # get db connection
                db = base.connectMysqlDB(config, autocommit=False)
                cur = db.cursor()
                try:
                    # read data from file
                    directory = base_path + "/" + login
                    for file_name in os.listdir(directory):
                        file_path = directory + "/" + file_name
                        text = base.get_info_from_file(file_path)
                        if text is False:
                            logging.warning("file not existed: " + file_path)
                            continue

                        user = json.loads(text)["data"]["user"]
                        print("read file: " + file_path)
                        edges = user["sponsorshipsAsSponsor"]["edges"]

                        # The GitHub user has received sponsorships but has
                        # never sponsored anyone.  The row is still written to
                        # github_sponsorships_as_sponsor so it can be used
                        # for filtering.
                        if not edges:
                            logging.warning("the user " + login +
                                            " doesn't sponsor others")
                            cur.execute(marker_sql,
                                        (login, login, str(base.flag4)))
                            db.commit()
                            continue

                        for count, edge in enumerate(edges, 1):
                            node = edge["node"]
                            if node["privacyLevel"] == "PRIVATE":
                                logging.info(
                                    "the %sth record is private in file: %s",
                                    count, file_path)
                                continue

                            slug = node["sponsorable"]["sponsorsListing"][
                                "slug"]
                            # Split only at the first hyphen so a login that
                            # itself contains hyphens is kept whole (the slug
                            # is presumably "sponsors-<login>" -- confirm).
                            sponsored_login = slug.split("-", 1)[1]
                            cur.execute(
                                insert_sql,
                                (sponsored_login, slug, user["login"], "3",
                                 base.time_handler(node["createdAt"])))
                            db.commit()
                            logging.info("the %sth record in file: %s",
                                         count, file_path)
                except Exception as e:
                    logging.fatal(e)
                finally:
                    cur.close()
                    db.close()
            finally:
                # Always balance the get() so Queue.join() can return.
                self.q.task_done()
Beispiel #9
0
    def run(self):
        """Download every issue page of one queued repository and store it.

        A single work item (a dict with ``repo_id``, ``owner`` and ``repo``
        keys) is taken from the queue; if the queue is already empty the
        method simply returns.  The GitHub ``/issues`` endpoint is then read
        page by page (``state=all``, 30 per page) until an empty page comes
        back.  Every non-empty page is archived through
        ``base.generate_file`` and each record is inserted into
        ``github_issue`` with ``flag`` 1 for pull requests and 0 for plain
        issues (committed row by row; a failing row is printed and skipped).

        The cursor and the connection are closed on every exit path.
        """
        try:  # Python 3 module name first, Python 2 fallback
            from queue import Empty
        except ImportError:
            from Queue import Empty

        try:
            work = self.q.get_nowait()
        except Empty:
            return  # another worker already took the last item
        print("the number of work in queue: " + str(self.q.qsize()))

        repo_id = work["repo_id"]
        owner = work["owner"]
        repo = work["repo"]
        page = 1
        total = 0  # issues and pull requests fetched so far

        insert_sql = (
            "insert into github_issue "
            "(id, number, user_login, owner_login, repo, created_at, updated_at, flag, comments, total_count, up, down, laugh, confused, heart, hooray, rocket, eyes) "
            "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

        # get db connection
        db = connectMysqlDB(config, autocommit=False)
        cur = db.cursor()

        try:
            while True:
                print("")
                try:
                    # get a suitable token and combine header
                    github_token = get_token()
                    headers = {
                        'User-Agent': 'Mozilla/5.0',
                        'Authorization': 'token ' + github_token,
                        'Content-Type': 'application/json',
                        'method': 'GET',
                        'Accept':
                        'application/vnd.github.squirrel-girl-preview+json'
                    }

                    # combine url, notice: per page is 30
                    url = ("https://api.github.com/repos/" + owner + "/" +
                           repo + "/issues" + "?state=all" + "&page=" +
                           str(page) + "&per_page=30")
                    print("url is: " + url)

                    try:
                        # request data and parse response
                        req = urllib2.Request(url=url, headers=headers)
                        response = urllib2.urlopen(req)
                        try:
                            result = json.loads(
                                response.read().decode("utf-8"))
                        finally:
                            response.close()

                        # an empty page means everything has been fetched
                        if not result:
                            print(
                                "finish & the sum of issue of pull request is: "
                                + str(total))
                            self.q.task_done()
                            return

                        # write file
                        filename = (base_path + "/" + owner + "&" + repo +
                                    "&" + str(repo_id) + "/" + str(page) +
                                    ".json")
                        status = base.generate_file(filename,
                                                    json.dumps(result))
                        if status is True:
                            print("create file successfully: " + filename)
                        elif status is False:
                            print("file is already existed: " + filename)
                        else:
                            # %-formatting so a non-string status cannot
                            # raise while the failure is being reported.
                            print("create file failed: %s filename: %s" %
                                  (status, filename))
                            continue  # same page is fetched again

                        # Count and advance only once the page has been
                        # accepted, so a retried page is not counted twice.
                        total += len(result)
                        page += 1

                        # handle response json data
                        for record in result:
                            # insert data to database table
                            try:
                                # "user" / "reactions" may be null or absent;
                                # store NULLs instead of losing the page.
                                user = record.get("user") or {}
                                reactions = record.get("reactions") or {}

                                # 0 represent issue, 1 represent pull request
                                flag = 1 if "pull_request" in record else 0
                                print("the issue type: " + str(flag))

                                cur.execute(
                                    insert_sql,
                                    (record.get("id"),
                                     record.get("number"),
                                     user.get("login"), owner, repo,
                                     base.time_handler(
                                         record.get("created_at")),
                                     base.time_handler(
                                         record.get("updated_at")), flag,
                                     record.get("comments"),
                                     reactions.get("total_count"),
                                     reactions.get("+1"),
                                     reactions.get("-1"),
                                     reactions.get("laugh"),
                                     reactions.get("confused"),
                                     reactions.get("heart"),
                                     reactions.get("hooray"),
                                     reactions.get("rocket"),
                                     reactions.get("eyes")))
                                db.commit()
                            except Exception as e:
                                print(str(e))
                    except urllib2.HTTPError as e:
                        print(str(e.code) + " error with this page: " + url)
                        if e.code != 404:
                            # mainly 403, sometimes 503
                            # token rate limit
                            # NOTE(review): the item is re-queued while this
                            # worker also keeps retrying the same page, and a
                            # 404 is retried without limit -- confirm the
                            # intended policy before changing it.
                            self.q.put(work)  # put into the queue again
                            # set sleep time for that token
                            sleep_time_tokens[github_token] = time.time()
                except Exception as e:
                    # unexpected error, don't interrupt the program
                    print(str(e))
        finally:
            cur.close()
            db.close()