Example no. 1
import json
import os

import requests

import common  # local helper providing get_header(); defined elsewhere in this repo

# get_args() is defined elsewhere in the same file (argparse setup not shown)
def main():
    args = get_args()
    token = args.token
    page_num = args.page_num
    count_threshold = args.count_threshold
    file_name = args.file_name

    h = common.get_header(token)
    url = "https://qiita.com/api/v2/tags?page={}&per_page=100&sort=count"

    with open("./tags/{}.tsv".format(file_name), "w") as f:
        for page in range(1, page_num + 1):
            print("----------- page={}".format(page))
            # requests expects a str URL; encoding it to bytes here was a bug
            res = requests.get(url.format(page), headers=h)

            # check the status code before trying to parse the body
            if res.status_code != 200:
                print("Response status code: {}".format(res.status_code))
                break

            d = json.loads(res.text)

            for item in d:
                if item["items_count"] >= count_threshold:
                    data_written = "{}\t{}\t{}\n".format(
                        item["id"], item["followers_count"],
                        item["items_count"])
                    f.write(data_written)
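
The Qiita snippets in this listing all call a shared common.get_header() helper that is never shown. A minimal sketch of what it plausibly returns, assuming the Bearer scheme that the Qiita API v2 uses for personal access tokens (the exact implementation is a guess):

def get_header(token=None):
    """Hypothetical reconstruction of common.get_header(); not the original code."""
    header = {"Content-Type": "application/json"}
    if token:
        # Qiita API v2 authenticates with a personal access token as a Bearer token
        header["Authorization"] = "Bearer {}".format(token)
    return header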
Example no. 2
import requests

import common  # same local helper as in Example no. 1
def main():
    h = common.get_header()
    url = "https://qiita.com/api/v2/items?page=1&per_page=1"
    res = requests.get(url, headers=h)
    total_count = int(res.headers['Total-Count'])

    print("Total Count of articles: {}".format(total_count))
Example no. 3
    def get_pagination_urls(self, base_url):
        """Get the pagination URLs for the project data to be collected."""

        url = base_url + '1'
        r = requests.get(url, headers=common.get_header())

        soup = Bs(r.text, 'html.parser')  # pin a parser to avoid BeautifulSoup's guessing warning
        page_data = Bs(str(
            soup.find_all(class_='ProjectList')), 'html.parser').find_all(class_='stat')
        max_page = math.ceil(
            int(page_data[0].next_sibling.next_sibling.text) / 20)

        return (base_url + str(i) for i in range(1, max_page + 1))
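
The "/ 20" above assumes the listing shows 20 projects per page, and the generator it returns is lazy, so nothing is fetched until iteration. A quick check of the arithmetic:

import math

assert math.ceil(173 / 20) == 9  # 173 projects -> 8.65 -> 9 page URLs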
Example no. 4
    def parse_project_baseinfo(self, url):
        """Parse the basic info of each project in the listing."""

        r = requests.get(url, headers=common.get_header())

        project_data = Bs(
            str(Bs(r.text, 'html.parser').find_all(class_='ProjectList')),
            'html.parser')
        list_data = project_data.find_all(class_='List')
        page = project_data.find_all(
            class_='stat')[0].next_sibling.next_sibling.text

        projects = []
        for w in Bs(str(list_data), 'html.parser').find_all('h3'):
            # w.select('a')[0] is already a Tag; re-parsing it through Bs was redundant
            link = 'http://www.oschina.net' + w.select('a')[0].get('href')
            projects.append({'url': link})

        return projects
Example no. 5
    def parse_proj_detail_info(self, url):
        """Parse the project's detail page."""
        r = requests.get(url, headers=common.get_header())
        item = [None, None]

        soup = Bs(r.text, 'html.parser')
        attrs_data = str(soup.find_all(class_='attrs'))

        if soup.find(class_='name'):
            name = soup.find(class_='name').u.text
        else:
            return item

        # str.find() returns -1 when missing (truthy!), so compare explicitly
        if attrs_data.find('年') != -1 and attrs_data.find('月') != -1:
            item = [
                attrs_data[attrs_data.find('年') -
                           4:attrs_data.find('月')].replace('年', '-').replace(
                               '月', '-'), name
            ]
            print(item[0], '\t', item[1])
        return item
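
Taken together, Examples no. 3-5 form a small oschina.net crawler. A hypothetical driver tying them together (the class name ProjectSpider and the listing URL are placeholders; only the three methods above appear in the source):

spider = ProjectSpider()  # assumed class holding the three methods above
base_url = 'http://www.oschina.net/project/...?p='  # placeholder; real listing URL not shown

for page_url in spider.get_pagination_urls(base_url):
    for project in spider.parse_project_baseinfo(page_url):
        created, name = spider.parse_proj_detail_info(project['url'])
        if created:
            print(created, name)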
Example no. 6
import importlib
import logging

import common  # here get_header()/get_underline() format a log table, unlike the Qiita examples

# the argparse `parser` used below is built earlier in the source file (not shown)
args = parser.parse_args()


# main module
if __name__ == '__main__':
    # Run
    try:
        logging.info(f"Loading model {args.model}")
        model = importlib.import_module('models.' + args.model)
    except ModuleNotFoundError:
        logging.error(f"Model {args.model} not found.")
        quit()

    if args.train:
        logging.info(f"Running training benchmark for {args.model}...")
        logging.info(common.get_header())
        logging.info(common.get_underline())
        batch_size = 10
        while batch_size <= args.observations:
            total_times, observation_times = model.run_training(batch_size)
            stats = common.calculate_stats(observation_times)
            logging.info(common.format_stats(batch_size, stats))
            batch_size *= 10
    # else:
    #     logging.info(f"Running testing benchmark for {args.model}...")
    #     logging.info(common.STATS)
    #     batch_size = 1
    #     while batch_size <= args.observations:
    #         model.run_inference(batch_size)
    #         batch_size *= 10
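
The common stats helpers driving the log output are not included. A minimal sketch of what calculate_stats and format_stats might compute, assuming per-observation timings in seconds (entirely a reconstruction, not the original module):

import statistics

def calculate_stats(observation_times):
    """Hypothetical: (mean, stdev, min, max) over a list of timings."""
    spread = statistics.stdev(observation_times) if len(observation_times) > 1 else 0.0
    return (statistics.mean(observation_times), spread,
            min(observation_times), max(observation_times))

def format_stats(batch_size, stats):
    """Hypothetical: render one aligned row for logging.info()."""
    return "{:>10d} {:>12.6f} {:>12.6f} {:>12.6f} {:>12.6f}".format(batch_size, *stats)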
Example no. 7
import json
import os

import pandas as pd
import requests

import common  # same Qiita helper module as in Examples no. 1 and 2

# get_args() and check_output_exists() are defined elsewhere in the same file
def main():
    args = get_args()
    token = args.token
    selected_tag = args.tag
    output_path = args.output
    page_num, per_page = common.get_correct_page_counts(args.page_num, args.per_page)

    h = common.get_header(token)
    url = "https://qiita.com/api/v2/items?page={}&per_page={}&query=tag%3A{}"

    # avoid recording the same article twice across runs
    already_counted_id_list = []
    if check_output_exists(output_path):
        df = pd.read_table(output_path)
        print(df.head(5))
        already_counted_id_list = set(df["id"].values)  # set for O(1) membership checks

    # create the docs dir and a new output file if they do not exist.
    os.makedirs("./docs", exist_ok=True)
    if not os.path.exists(output_path):
        with open(output_path, "w") as f:
            f.write("id\ttags\n")

    # Qiita API call
    with open(output_path, "a") as f:
        print("----- TAG: {} -----".format(selected_tag))
        for page in range(1, page_num + 1):
            api_req = url.format(page, per_page, selected_tag)
            print(api_req)
            res = requests.get(api_req, headers=h)
            data = res.text

            if res.status_code != 200:
                print("Status code: {}".format(res.status_code))
                break

            # `data is ''` compared identity, not equality; truthiness covers both cases
            if not data:
                continue
            
            try:
                d = json.loads(data)
                for d_item in d:
                    # ignoring already saved article
                    article_id = d_item["id"]
                    if article_id in already_counted_id_list:
                        print("Skip article with id: {}".format(article_id))
                        continue

                    # tag list -> comma-separated tag names
                    tags_str = ",".join(tag["name"] for tag in d_item["tags"])
                    #print("id={}, tag_list={}".format(article_id, tags_str))
                    f.write("{}\t{}\n".format(article_id, tags_str))

                    # body
                    body = d_item["body"]
                    with open("./docs/{}.md".format(article_id), "w") as doc_f:
                        doc_f.write(body)
            
            except Exception as e:
                # a bare `pass` silently hid JSON/IO failures; at least report them
                print("Failed to process page {}: {}".format(page, e))
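
check_output_exists() and get_args() come from elsewhere in the source file. For the former, a plausible stand-in so the snippet can run on its own (an assumption, not the original helper):

def check_output_exists(output_path):
    """Hypothetical helper: True when a previous run already produced the TSV."""
    return os.path.exists(output_path) and os.path.getsize(output_path) > 0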