def get_owner_repo_map():
    """
    We use repo owner and repo name to distinguish one repo from another.
    """
    repo_info_list = get_repo_info(to_dict=False)
    ret_map = dict()
    for ri in repo_info_list:
        key = (ri.repo_owner, ri.repo_name)
        if key not in ret_map:
            ret_map[key] = list()
        ret_map[key].append(ri.paper_id)
    # Sanity check: every paper id grouped under a repo should point back to
    # the same (owner, name) pair in the paper table; mismatches are printed.
    paper_data = get_papers_from_db()
    paper_map = dict()
    for pd in paper_data:
        paper_map[pd.id] = (pd.repo_owner, pd.repo_name)
    for (o, n) in ret_map.keys():
        pids = ret_map[(o, n)]
        for pid in pids:
            if paper_map[pid] != (o, n):
                print(o, n)
                print(paper_map[pid])
    return ret_map

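
# A minimal sketch of the same grouping step written with
# collections.defaultdict, shown against a hypothetical list of
# (repo_owner, repo_name, paper_id) tuples rather than the real
# get_repo_info() objects; it is an equivalent formulation, not the
# project's code.
from collections import defaultdict

def group_papers_by_repo(rows):
    grouped = defaultdict(list)
    for owner, name, paper_id in rows:
        grouped[(owner, name)].append(paper_id)
    return dict(grouped)

# group_papers_by_repo([('a', 'r1', 1), ('a', 'r1', 2), ('b', 'r2', 3)])
# -> {('a', 'r1'): [1, 2], ('b', 'r2'): [3]}
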
def crawl_repo_stargazer():
    papers = get_papers_from_db()
    for p in papers:
        s = StargazerCrawler(p)
        # Page through the stargazer data until the crawler signals the end,
        # writing each page to disk and pausing briefly between requests.
        while not s.end_crawl:
            result_json = s.get_next_page()
            s.result_to_disk(result_json)
            time.sleep(0.5)

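
# StargazerCrawler is treated as a black box here. For reference, a minimal
# sketch of fetching one page of stargazers from the GitHub REST API,
# assuming a personal access token; the 'star+json' media type adds the
# 'starred_at' timestamp to each entry. This is illustrative, not the
# project's actual crawler.
import requests

def fetch_stargazer_page(owner, repo, page, token):
    url = 'https://api.github.com/repos/{}/{}/stargazers'.format(owner, repo)
    headers = {
        'Accept': 'application/vnd.github.star+json',
        'Authorization': 'token ' + token,
    }
    resp = requests.get(url, headers=headers,
                        params={'page': page, 'per_page': 100})
    resp.raise_for_status()
    return resp.json()  # an empty list means we are past the last page
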
def combine_with_paper(self):
    # Look up the paper record for this object; paper ids are assumed to be
    # 1-based and contiguous, so paper_id - 1 is the list index.
    paper_data = get_papers_from_db(with_citation=True)
    paper_obj = paper_data[self.paper_id - 1]
    self.title = paper_obj.title
    self.conf = paper_obj.get_conf()
    self.year = paper_obj.year
    self.citation = paper_obj.citation
    self.paper_repo_owner = paper_obj.repo_owner
    self.paper_repo_name = paper_obj.repo_name

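
# The positional lookup above relies on ids lining up with list order. A
# minimal alternative sketch that keys papers by id instead; paper_by_id is a
# hypothetical helper, not part of the existing code.
def paper_by_id(paper_id):
    papers = get_papers_from_db(with_citation=True)
    id_map = {p.id: p for p in papers}
    return id_map[paper_id]
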
def crawl_main():
    paper_data = get_papers_from_db()
    for pd in paper_data:
        print(pd.id)
        link = pd.link
        link2 = preprocess_url(link)
        website, save_path = get_paper_path(link2, pd.id)
        response = requests.get(link2)
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(response.text)

def crawl_main():
    paper_data = get_papers_from_db()
    for pd in paper_data:
        print(pd.id)
        link = pd.link
        link2 = preprocess_url(link)
        save_path = get_paper_path(pd.id)
        # Skip papers whose PDF has already been downloaded.
        if os.path.exists(save_path):
            continue
        print(pd.title)
        print(link2)
        # Some ICCV links use the wrong case in the path segment.
        if 'content_iccv' in link2:
            link2 = link2.replace('content_iccv', 'content_ICCV')
        # 'Hello' is the sentinel value for links that cannot be downloaded
        # directly; only fetch real URLs.
        if link2 != 'Hello':
            download_file(link2, save_path)

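
# download_file() is defined elsewhere in the project. A minimal sketch of a
# streaming download with requests, assuming the helper simply saves the file
# at the given path; the chunk size and timeout are illustrative choices.
import requests

def download_file_sketch(url, save_path, chunk_size=8192):
    with requests.get(url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
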
def get_anomaly_repo():
    repo_info_list = get_repo_info(to_dict=False)
    ret_map = dict()
    paper_data = get_papers_from_db()
    for ri in repo_info_list:
        key = (ri.repo_owner, ri.repo_name)
        if key not in ret_map:
            ret_map[key] = list()
        ret_map[key].append(ri.paper_id)
    for key in ret_map.keys():
        if len(ret_map[key]) > 1:
            for pid in ret_map[key]:
                print(paper_data[pid - 1].title)

def store_repo_info():
    paper_data = get_papers_from_db()
    db_objs = list()
    for pd in paper_data:
        pd_id = pd.id
        print(pd_id)
        r = Repo(pd)
        r.from_json()
        r.paper_id = pd_id
        # Skip repos whose crawled JSON did not yield a star count.
        if r.stars_count is None:
            continue
        db_objs.append(r.to_db_obj())
    db_api = DataBaseApi()
    db_api.insert_objs(db_objs)
    db_api.close_session()

def store_star_event():
    """
    For each repo: find all pages of json data;
    for each page of json data:
        for each json_obj in the page: store it into db.
    """
    paper_data = get_papers_from_db()
    db_obj_list = list()
    i = 1
    repo_set = set()
    for pd in paper_data:
        # Several papers can share one repo; only process each repo once.
        if (pd.repo_owner, pd.repo_name) in repo_set:
            continue
        print(i)
        repo_set.add((pd.repo_owner, pd.repo_name))
        json_dir_path = os.path.join(conf.star_path, pd.repo_owner, pd.repo_name)
        if not os.path.exists(json_dir_path):
            continue
        file_names = os.listdir(json_dir_path)
        file_number = len(file_names)
        # Pages are stored as 1.json, 2.json, ... in crawl order.
        j = 1
        while j <= file_number:
            json_path = os.path.join(json_dir_path, str(j) + '.json')
            j += 1
            with open(json_path, 'r', encoding='utf-8') as f:
                json_obj_list = simplejson.load(f)
            for json_obj in json_obj_list:
                se = StarEvent(pd)
                se.from_json_obj(json_obj)
                if se.timestamp is not None:
                    db_obj_list.append(se.to_db_obj())
                # Flush to the database every 20000 star events so the
                # pending list never grows unbounded.
                if len(db_obj_list) == 20000:
                    db_api = DataBaseApi()
                    db_api.insert_objs(db_obj_list)
                    db_api.close_session()
                    db_obj_list = list()
        i += 1
    # Insert whatever is left after the last repo.
    if len(db_obj_list) > 0:
        db_api = DataBaseApi()
        db_api.insert_objs(db_obj_list)
        db_api.close_session()

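
# A minimal sketch of the batching pattern above factored into a reusable
# helper: consume an iterable of ORM objects and flush them to the database
# in fixed-size chunks. DataBaseApi is the project's session wrapper; the
# batch size mirrors the 20000 used above but is otherwise an arbitrary
# choice.
def insert_in_batches(db_obj_iter, batch_size=20000):
    batch = list()
    for obj in db_obj_iter:
        batch.append(obj)
        if len(batch) == batch_size:
            db_api = DataBaseApi()
            db_api.insert_objs(batch)
            db_api.close_session()
            batch = list()
    if batch:
        db_api = DataBaseApi()
        db_api.insert_objs(batch)
        db_api.close_session()
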
def move_project_to_owner_dir():
    papers = get_papers_from_db()
    # Count how many papers use each repo name; a name that appears only once
    # can be moved unambiguously into its owner's directory.
    repo_num_map = dict()
    for pd in papers:
        repo_name = pd.repo_name
        if repo_name not in repo_num_map:
            repo_num_map[repo_name] = 0
        repo_num_map[repo_name] += 1
    for pd in papers:
        repo_name = pd.repo_name
        repo_owner = pd.repo_owner
        old_repo_path = os.path.join(conf.root_path, 'repos', repo_name)
        if repo_num_map[repo_name] == 1 and os.path.exists(old_repo_path):
            repo_owner_path = os.path.join(conf.repo_path, repo_owner)
            if not os.path.exists(repo_owner_path):
                os.makedirs(repo_owner_path)
            new_repo_path = os.path.join(repo_owner_path, repo_name)
            os.rename(old_repo_path, new_repo_path)
        else:
            # Ambiguous or missing checkouts are re-cloned instead of moved.
            clone_repos(pd)

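
# clone_repos() lives elsewhere in the project. A minimal sketch of what such
# a helper might do, assuming the repo is hosted at github.com/<owner>/<name>
# and should land in a per-owner directory; the URL layout and the use of
# subprocess are assumptions, not the project's actual implementation.
import os
import subprocess

def clone_repo_sketch(repo_owner, repo_name, repo_root):
    owner_dir = os.path.join(repo_root, repo_owner)
    if not os.path.exists(owner_dir):
        os.makedirs(owner_dir)
    url = 'https://github.com/{}/{}.git'.format(repo_owner, repo_name)
    subprocess.run(['git', 'clone', url, os.path.join(owner_dir, repo_name)],
                   check=True)
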
        # (Readme base class, continued: load the readme text from disk and
        # initialize the parsed-HTML slot.)
        readme_path = os.path.join(repo_path, 'readme.md')
        with open(readme_path, 'r', encoding='utf-8', errors='ignore') as f:
            self.readme_content = f.read()
        self.html_soup = None


class MarkDownReadme(Readme):

    def to_html(self):
        # Render the raw markdown and keep the parsed soup for later queries.
        readme_html = markdown(self.readme_content)
        html_soup = BeautifulSoup(readme_html, 'html.parser')
        self.html_soup = html_soup

    def parse_readme_html(self):
        # Walk header levels h1-h6; levels with no headers are skipped.
        # The per-level handling below is still a stub.
        header_index = range(1, 7)
        for hi in header_index:
            headers = self.html_soup.find_all('h' + str(hi))
            if len(headers) == 0:
                continue
            else:
                pass


if __name__ == '__main__':
    paper_data = get_papers_from_db()
    repo_set = set()
    for pd in paper_data:
        if (pd.repo_owner, pd.repo_name) in repo_set:
            continue
        repo_set.add((pd.repo_owner, pd.repo_name))
        mdr = MarkDownReadme(pd.repo_owner, pd.repo_name)

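
# parse_readme_html() above stops short of using the headers it finds. A
# minimal sketch of one way to collect section titles per header level from
# the parsed soup, under the assumption that this is the kind of structure
# the parsing step is building toward; it is illustrative, not the project's
# finished parser.
def collect_section_titles(html_soup):
    titles = dict()
    for level in range(1, 7):
        headers = html_soup.find_all('h{}'.format(level))
        if headers:
            titles[level] = [h.get_text(strip=True) for h in headers]
    return titles
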
def crawl_repo_info():
    papers = get_papers_from_db()
    for p in papers:
        r = RepoInfoCrawler(p)
        r.crawl_to_disk()

from obj.paper import get_papers_from_db
from configuration import conf
import os
import re

paper_data = get_papers_from_db(with_citation=True)
i = 1
for pd in paper_data:
    repo_owner = pd.repo_owner
    repo_name = pd.repo_name
    repo_path = os.path.join(conf.repo_path, repo_owner, repo_name)
    if i > 378:
        print(pd.title, pd.code_link)
        print(pd.repo_owner)
        print(pd.link)
    if os.path.exists(repo_path):
        file_list = os.listdir(repo_path)
        # Find the readme regardless of its extension (readme.md, readme.rst, ...).
        readme_path = ''
        for f in file_list:
            if f.lower().startswith('readme.'):
                readme_path = os.path.join(repo_path, f)
        if readme_path == '':
            readme_content = ''
        else:
            with open(readme_path, 'r', encoding='utf-8', errors='ignore') as readme_f:
                readme_content = readme_f.read()
        temp = readme_content.lower()
        repo_desc = pd.get_repo_desc()