def md2arff():
    """Dump each repo's README text plus a popular/unpopular label to an ARFF file."""
    ri_obj_list = get_repo_info(to_dict=False, combine_star_events=True)
    repo_set = set()
    data = dict()
    data['attributes'] = attrs
    data['description'] = ''
    data['relation'] = 'readme'
    inline_data = list()
    for ri in ri_obj_list:
        # One row per repo: skip repos we have already processed.
        if (ri.repo_owner, ri.repo_name) in repo_set:
            continue
        repo_set.add((ri.repo_owner, ri.repo_name))
        paper_repo_owner = getattr(ri, 'paper_repo_owner')
        paper_repo_name = getattr(ri, 'paper_repo_name')
        repo_path = os.path.join(conf.repo_path, paper_repo_owner, paper_repo_name)
        assert os.path.exists(repo_path)
        # Locate the README file (any extension) in the repo root.
        readme_path = ''
        readme_name = ''
        for file_name in os.listdir(repo_path):
            if file_name.lower().startswith('readme.'):
                readme_path = os.path.join(repo_path, file_name)
                readme_name = file_name
                break
        if readme_path == '':
            readme_content = ''
        else:
            with open(readme_path, 'r', encoding='utf-8', errors='ignore') as readme_f:
                readme_content = readme_f.read()
        # Only Markdown READMEs get parsed down to plain text.
        if readme_name.lower() == 'readme.md':
            readme_content = parse_markdown(readme_content)
        # Normalize: lower-case and strip characters that would break the ARFF string.
        readme_content = readme_content.lower()
        readme_content = readme_content.replace('\n', ' ')
        readme_content = readme_content.replace('"', ' ')
        readme_content = readme_content.replace("'", ' ')
        label = 'popular' if ri.stars_count >= threshold else 'unpopular'
        inline_data.append([label, readme_content])
    data['data'] = inline_data
    file_content = arff.dumps(data)
    arff_path = os.path.join(conf.root_path, 'text_analysis.arff')
    with open(arff_path, 'w', encoding='utf-8') as f:
        f.write(file_content)

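# A minimal round-trip sanity check for the file written above. This assumes
# the `arff` module is liac-arff (its dumps/load API matches the usage here);
# the helper below is a sketch, not part of the original module.
def check_arff_roundtrip(arff_path):
    with open(arff_path, 'r', encoding='utf-8') as f:
        dataset = arff.load(f)
    print(dataset['relation'])    # e.g. 'readme'
    print(len(dataset['data']))   # one row per repo
    print(dataset['data'][0][0])  # class label: 'popular' or 'unpopular'
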
def get_anomaly_repo():
    """Print titles of papers that share a repo with at least one other paper."""
    repo_info_list = get_repo_info(to_dict=False)
    ret_map = dict()
    paper_data = get_papers_from_db()
    for ri in repo_info_list:
        key = (ri.repo_owner, ri.repo_name)
        ret_map.setdefault(key, list()).append(ri.paper_id)
    for key, paper_ids in ret_map.items():
        if len(paper_ids) > 1:
            for pid in paper_ids:
                # Paper ids are 1-based, so offset into the 0-based list.
                print(paper_data[pid - 1].title)

def percentage_star_at_study_date():
    """Check whether the chosen analysis date is a reasonable study cut-off."""
    study_year = 2017
    study_conf = 'CVPR'
    ri_obj_list = get_repo_info(to_dict=False, combine_star_events=True)
    repo_set = set()
    for ri in ri_obj_list:
        if (ri.repo_owner, ri.repo_name) in repo_set:
            continue
        if ri.conf != study_conf or ri.year != study_year:
            continue
        repo_set.add((ri.repo_owner, ri.repo_name))
        star_events = getattr(ri, 'star_events')
        for se in star_events:
            pass  # TODO: accumulate stars observed on or before the study date

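# The inner loop above is left as a stub. A minimal sketch of the intended
# percentage computation, assuming each star event exposes a `timestamp`
# attribute (as definite_time_stars below also assumes) and a hypothetical
# `study_date` cut-off:
def fraction_starred_by(star_events, study_date):
    """Fraction of a repo's star events that occurred on or before study_date."""
    if not star_events:
        return 0.0
    early = sum(1 for se in star_events if se.timestamp <= study_date)
    return early / len(star_events)
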
def get_data():
    """Dump each paper's extracted text plus a popularity label to an ARFF file."""
    data = dict()
    data['attributes'] = attrs
    data['description'] = ''
    data['relation'] = 'abs'
    repo_infos = get_repo_info(combine_star_events=True)
    data_list = list()
    for ri in repo_infos:
        d = list()
        # NIPS 2018 papers are excluded from this analysis.
        if ri.year == 2018 and ri.conf == 'NIPS':
            continue
        conference = getattr(ri, 'conf')
        title = getattr(ri, 'title')
        if conference is None or conference == '':
            continue
        if title in conf.excluded_papers:
            continue
        if ri.language == '' or ri.language is None:
            continue
        d.append('popular' if ri.stars_count >= threshold else 'unpopular')
        # abs_path = os.path.join(conf.paper_abs_path(), 'abstracts')
        abs_path = os.path.join(conf.paper_pdf_path(), 'text')
        this_abs_path = os.path.join(abs_path, str(ri.paper_id) + '.txt')
        with open(this_abs_path, 'r', encoding='ascii', errors='ignore') as f:
            file_content = f.read()
        # Normalize: lower-case and strip characters that would break the ARFF string.
        content = file_content.lower()
        for ch in ('\r', '\n', '"', "'"):
            content = content.replace(ch, ' ')
        d.append(content)
        data_list.append(d)
    data['data'] = data_list
    file_content = arff.dumps(data)
    arff_path = os.path.join(conf.root_path, 'paper_analysis.arff')
    with open(arff_path, 'w', encoding='utf-8') as f:
        f.write(file_content)
    return data_list

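# Both ARFF writers depend on a module-level `attrs` list defined elsewhere.
# liac-arff expects (name, type) pairs, so for these two-column datasets
# (label plus free text) `attrs` plausibly looks like the following
# hypothetical schema:
#
# attrs = [
#     ('class', ['popular', 'unpopular']),  # nominal label
#     ('text', 'STRING'),                   # lower-cased README/abstract text
# ]
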
def definite_time_stars(days=90):
    """Count repos that got more than half of their stars within `days` of creation."""
    ri_obj_list = get_repo_info(to_dict=False)
    repo_set = set()
    statistics = 0
    for ri in ri_obj_list:
        if (ri.repo_owner, ri.repo_name) in repo_set:
            continue
        ri.combine_with_star_events()
        repo_set.add((ri.repo_owner, ri.repo_name))
        star_events = getattr(ri, 'star_events')
        good_stars = 0
        for se in star_events:
            # A star counts as "good" if it arrived within the first `days` days.
            if (se.timestamp - ri.created_at).days < days:
                good_stars += 1
        if len(star_events) > 0:
            print(good_stars / len(star_events), ri.conf, ri.created_at, ri.stars_count)
            if good_stars / len(star_events) > 0.5:
                statistics += 1
    print(statistics)

def __init__(self, paper_obj):
    assert isinstance(paper_obj, Paper)
    self.paper_id = getattr(paper_obj, 'id')
    self.repo_name = paper_obj.repo_name
    self.repo_owner = paper_obj.repo_owner
    self.initial_url = 'https://api.github.com/repos/'
    self.page = 0
    self.per_page = page_size
    self.end_crawl = False
    # Skip crawling entirely if the repo has not been cloned to disk.
    if not self.check_repo_in_disk():
        self.end_crawl = True
        print('Repo %s not in disk!' % self.repo_name)
    if not self.end_crawl:
        print('Crawling data for %s\'s Repo %s' % (self.repo_owner, self.repo_name))
        self.repo_info = get_repo_info(to_dict=True)[self.paper_id]
        # if self.check_all_data_in_disk():
        #     self.end_crawl = True
        #     print('Repo %s downloaded' % self.repo_name)
        # Resume from the last fully crawled page.
        self.page = self.get_start_page() - 1

def get_all_text():
    """Collect LDA-ready token lists from every eligible paper's extracted text."""
    text_data = []
    # abs_path = os.path.join(conf.paper_abs_path(), 'abstracts')
    abs_path = os.path.join(conf.paper_pdf_path(), 'text')
    repo_info = get_repo_info(combine_star_events=True)
    for ri in repo_info:
        title = getattr(ri, 'title')
        conference = getattr(ri, 'conf')
        if ri.year <= 2014:
            continue
        if ri.language == '' or ri.language is None:
            continue
        if title in conf.excluded_papers:
            continue
        if conference is None or conference == '':
            continue
        paper_id = getattr(ri, 'paper_id')
        f_path = os.path.join(abs_path, str(paper_id) + '.txt')
        with open(f_path, 'r', encoding='ascii', errors='ignore') as f_obj:
            file_content = f_obj.read()
        tokens = prepare_text_for_lda(file_content)
        text_data.append(tokens)
    return text_data

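# `prepare_text_for_lda` is defined elsewhere in the project. A minimal sketch
# of a typical implementation, assuming gensim's preprocessing utilities rather
# than the project's actual tokenizer:
#
# from gensim.utils import simple_preprocess
# from gensim.parsing.preprocessing import STOPWORDS
#
# def prepare_text_for_lda(text, min_len=4):
#     """Tokenize, lower-case, and drop stopwords and short tokens for LDA."""
#     return [tok for tok in simple_preprocess(text, min_len=min_len)
#             if tok not in STOPWORDS]
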
abs_attrs = AbstractTopicModelFeatures.attrs
code_attrs = CodeFeatures.attrs
readme_attrs = deepcopy(ReadmeFeatures.attrs)
repometa_attrs = RepoMetaFeatures.attrs
# Exclude these two README features from the combined attribute set.
readme_attrs.remove('contain_docker')
readme_attrs.remove('contain_data')
all_attrs = abs_attrs + code_attrs + readme_attrs + repometa_attrs
all_attrs.append('stars')

if __name__ == '__main__':
    from obj.repo import get_repo_info

    repo_info = get_repo_info(combine_star_events=True)
    dict_list = list()
    for i, ri in enumerate(repo_info):
        print(i)  # progress indicator
        if ri.year == 2014:
            continue
        conference = getattr(ri, 'conf')
        title = getattr(ri, 'title')
        if conference is None or conference == '':
            continue
        if title in conf.excluded_papers:
            continue
        if ri.language == '' or ri.language is None:
            continue
        abs_feature_dict = AbstractTopicModelFeatures(ri).to_dict()

def in_year_month(date_obj, year, month):
    assert isinstance(date_obj, datetime)
    return date_obj.year == year and date_obj.month == month


def extract_year_month(ri_obj):
    assert isinstance(ri_obj, Repo)
    return ri_obj.created_at.year, ri_obj.created_at.month


if __name__ == '__main__':
    ri_objs = get_repo_info(to_dict=False, combine_paper=True)
    ri_set = set()
    each_month_counter = dict()
    study_conf = 'ICML'
    for ri in ri_objs:
        if (ri.repo_owner, ri.repo_name) in ri_set:
            continue
        if ri.conf != study_conf:
            continue
        ri_set.add((ri.repo_owner, ri.repo_name))
        year, month = extract_year_month(ri)
        each_month_counter.setdefault((year, month), 0)
        each_month_counter[(year, month)] += 1
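    # The counter is built but never reported; a small follow-up (not in the
    # original) that prints per-month repo-creation counts in chronological order:
    for (year, month) in sorted(each_month_counter):
        print('%d-%02d: %d repos' % (year, month, each_month_counter[(year, month)]))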