def main():
    args = get_args()
    token = args.token
    page_num = args.page_num
    count_threshold = args.count_threshold
    file_name = args.file_name

    h = common.get_header(token)
    url = "https://qiita.com/api/v2/tags?page={}&per_page=100&sort=count"

    with open("./tags/{}.tsv".format(file_name), "w") as f:
        for page in range(1, page_num + 1):
            print("----------- page={}".format(page))
            res = requests.get(url.format(page), headers=h)
            # Bail out before parsing if the API call failed.
            if res.status_code != 200:
                print("Response status code: {}".format(res.status_code))
                break
            d = json.loads(res.text)
            # Keep only tags with enough tagged articles.
            for item in d:
                if item["items_count"] >= count_threshold:
                    f.write("{}\t{}\t{}\n".format(
                        item["id"], item["followers_count"],
                        item["items_count"]))
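# Several scripts here rely on a shared common.get_header(token) helper
# that is not shown. A minimal sketch, assuming Qiita API v2's
# bearer-token scheme; the real helper in common.py may differ:
def get_header(token=None):
    """Build request headers, attaching a bearer token when one is given."""
    headers = {"Content-Type": "application/json"}
    if token:
        headers["Authorization"] = "Bearer {}".format(token)
    return headers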
def main():
    h = common.get_header()
    url = "https://qiita.com/api/v2/items?page=1&per_page=1"
    res = requests.get(url, headers=h)
    # Qiita reports the total number of items in the Total-Count header.
    total_count = int(res.headers["Total-Count"])
    print("Total Count of articles: {}".format(total_count))
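# A possible follow-up, assuming the Total-Count value is meant to size
# a pagination loop; Qiita API v2 caps per_page at 100, so the number
# of pages needed to walk every article would be:
import math

def count_pages(total_count, per_page=100):
    """Pages needed to fetch every article at per_page items each."""
    return math.ceil(total_count / per_page)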
def get_pagination_urls(self, base_url):
    """Build the paginated URLs for collecting project data."""
    url = base_url + '1'
    r = requests.get(url, headers=common.get_header())
    soup = Bs(r.text, 'html.parser')
    # The total project count sits next to the .stat element;
    # each listing page shows 20 projects.
    page_data = Bs(str(soup.find_all(class_='ProjectList')),
                   'html.parser').find_all(class_='stat')
    max_page = math.ceil(
        int(page_data[0].next_sibling.next_sibling.text) / 20)
    return (base_url + str(i) for i in range(1, max_page + 1))
def parse_project_baseinfo(self, url):
    """Parse the basic information of each project on a listing page."""
    r = requests.get(url, headers=common.get_header())
    project_data = Bs(str(Bs(r.text, 'html.parser')
                          .find_all(class_='ProjectList')), 'html.parser')
    list_data = project_data.find_all(class_='List')
    projects = []
    # Each project entry is an <h3> whose first <a> links to the detail page.
    for w in Bs(str(list_data), 'html.parser').find_all('h3'):
        link = 'http://www.oschina.net' + w.select('a')[0].get('href')
        projects.append({'url': link})
    return projects
def parse_proj_detail_info(self, url):
    """Parse a project's detail page for its release date and name."""
    r = requests.get(url, headers=common.get_header())
    item = [None, None]
    soup = Bs(r.text, 'html.parser')
    attrs_data = str(soup.find_all(class_='attrs'))
    if soup.find(class_='name'):
        name = soup.find(class_='name').u.text
    else:
        return item
    # str.find returns -1 when the marker is missing, so compare
    # explicitly instead of relying on truthiness.
    if attrs_data.find('年') != -1 and attrs_data.find('月') != -1:
        # Slice out "YYYY年MM" and normalize it to "YYYY-MM".
        item = [
            attrs_data[attrs_data.find('年') - 4:attrs_data.find('月')]
            .replace('年', '-').replace('月', '-'),
            name
        ]
        print(item[0], '\t', item[1])
    return item
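# Hypothetical driver wiring the three scraper methods above together.
# The class name ProjectSpider is an assumption (the methods take self,
# so they live on some class in the original source), and BASE_URL is a
# placeholder expected to end right before the page number that
# get_pagination_urls appends.
if __name__ == '__main__':
    spider = ProjectSpider()  # assumed host class of the methods above
    BASE_URL = 'http://www.oschina.net/project/...'  # placeholder, elided
    for page_url in spider.get_pagination_urls(BASE_URL):
        for project in spider.parse_project_baseinfo(page_url):
            spider.parse_proj_detail_info(project['url'])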
args = parser.parse_args()

# main module
if __name__ == '__main__':
    # Run
    try:
        logging.info(f"Loading model {args.model}")
        model = importlib.import_module('models.' + args.model)
    except ModuleNotFoundError:
        logging.error(f"Model {args.model} not found.")
        quit()
    if args.train:
        logging.info(f"Running training benchmark for {args.model}...")
        logging.info(common.get_header())
        logging.info(common.get_underline())
        # Grow the batch size by a factor of 10 up to the requested
        # number of observations.
        batch_size = 10
        while batch_size <= args.observations:
            total_times, observation_times = model.run_training(batch_size)
            stats = common.calculate_stats(observation_times)
            logging.info(common.format_stats(batch_size, stats))
            batch_size *= 10
    else:
        logging.info(f"Running testing benchmark for {args.model}...")
        logging.info(common.STATS)
        batch_size = 1
        while batch_size <= args.observations:
            model.run_inference(batch_size)
            batch_size *= 10
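# Minimal sketch of the stats helpers the training loop above assumes
# (common.calculate_stats / common.format_stats); the real common.py
# may compute or format these differently:
import statistics

def calculate_stats(observation_times):
    """Summarize per-observation timings."""
    return {
        "mean": statistics.mean(observation_times),
        "stdev": statistics.pstdev(observation_times),
        "min": min(observation_times),
        "max": max(observation_times),
    }

def format_stats(batch_size, stats):
    """Render one fixed-width table row for a batch size."""
    return "{:>10} {mean:>10.4f} {stdev:>10.4f} {min:>10.4f} {max:>10.4f}".format(
        batch_size, **stats)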
def main():
    args = get_args()
    token = args.token
    selected_tag = args.tag
    output_path = args.output
    page_num, per_page = common.get_correct_page_counts(args.page_num,
                                                        args.per_page)
    h = common.get_header(token)
    url = "https://qiita.com/api/v2/items?page={}&per_page={}&query=tag%3A{}"

    # Avoid re-fetching articles that are already in the output file.
    already_counted_id_list = []
    if check_output_exists(output_path):
        df = pd.read_table(output_path)
        print(df.head(5))
        already_counted_id_list = df["id"].values

    # Create a new output file with a header row if it does not exist.
    if not os.path.exists(output_path):
        with open(output_path, "w") as f:
            f.write("id\ttags\n")

    # Qiita API call
    with open(output_path, "a") as f:
        print("----- TAG: {} -----".format(selected_tag))
        for page in range(1, page_num + 1):
            api_req = url.format(page, per_page, selected_tag)
            print(api_req)
            res = requests.get(api_req, headers=h)
            if res.status_code != 200:
                print("Status code: {}".format(res.status_code))
                break
            data = res.text
            if not data:
                continue
            try:
                d = json.loads(data)
                for d_item in d:
                    # Skip articles that were already saved.
                    article_id = d_item["id"]
                    if article_id in already_counted_id_list:
                        print("Skip article with id: {}".format(article_id))
                        continue
                    # Join the tag names into a comma-separated string.
                    tags_str = ",".join(tag["name"] for tag in d_item["tags"])
                    # print("id={}, tag_list={}".format(article_id, tags_str))
                    f.write("{}\t{}\n".format(article_id, tags_str))
                    # Save the article body as a Markdown file.
                    with open("./docs/{}.md".format(article_id), "w") as doc_f:
                        doc_f.write(d_item["body"])
            except Exception as e:
                print("Error while processing page {}: {}".format(page, e))
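# Minimal sketch of the check_output_exists helper used above; it is
# assumed to guard pd.read_table against a missing or empty file, and
# the real helper may differ:
import os

def check_output_exists(path):
    """True when the output file exists and already has content."""
    return os.path.exists(path) and os.path.getsize(path) > 0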