import os
from copy import deepcopy
from datetime import datetime

import arff  # liac-arff: serializes {'relation', 'attributes', 'data'} dicts to ARFF

# attrs, threshold, conf, get_repo_info, get_papers_from_db, parse_markdown,
# prepare_text_for_lda, page_size, Paper, Repo, and the *Features classes are
# project-level names defined elsewhere in this codebase.


def md2arff():
    ri_obj_list = get_repo_info(to_dict=False, combine_star_events=True)
    repo_set = set()
    data = dict()
    data['attributes'] = attrs
    data['description'] = ''
    data['relation'] = 'readme'
    readme_file_set = set()
    inline_data = list()
    for ri in ri_obj_list:
        if (ri.repo_owner, ri.repo_name) in repo_set:
            continue
        repo_set.add((ri.repo_owner, ri.repo_name))
        paper_repo_owner = getattr(ri, 'paper_repo_owner')
        paper_repo_name = getattr(ri, 'paper_repo_name')
        repo_path = os.path.join(conf.repo_path, paper_repo_owner, paper_repo_name)

        assert os.path.exists(repo_path)
        file_list = os.listdir(repo_path)
        readme_path = ''
        for f in file_list:
            if f.lower().startswith('readme.'):
                readme_path = os.path.join(repo_path, f)
                break
        if readme_path == '':
            readme_content = ''
        else:
            with open(readme_path, 'r', encoding='utf-8', errors='ignore') as readme_f:
                readme_content = readme_f.read()

        # Only markdown READMEs need parsing before text cleanup. Checking the
        # basename of readme_path avoids relying on the leaked loop variable.
        if readme_path != '' and os.path.basename(readme_path).lower() == 'readme.md':
            readme_content = parse_markdown(readme_content)

        readme_content = readme_content.lower()
        readme_content = readme_content.replace('\n', ' ')
        readme_content = readme_content.replace('\"', ' ')
        readme_content = readme_content.replace('\'', ' ')
        inline_data_unit = list()
        if ri.stars_count >= threshold:
            inline_data_unit.append('popular')
        else:
            inline_data_unit.append('unpopular')
        inline_data_unit.append(readme_content)
        inline_data.append(inline_data_unit)

    data['data'] = inline_data

    file_content = arff.dumps(data)
    arff_path = os.path.join(conf.root_path, 'text_analysis.arff')
    with open(arff_path, 'w', encoding='utf-8') as f:
        f.write(file_content)
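
# A minimal read-back sketch, assuming the `arff` module used here is the
# liac-arff package (its arff.load parses an ARFF file back into the same
# dict shape that arff.dumps consumes):
def inspect_arff(path):
    with open(path, 'r', encoding='utf-8') as f:
        parsed = arff.load(f)
    # 'relation', 'attributes', and 'data' mirror the dict built in md2arff().
    print(parsed['relation'], len(parsed['data']))
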
def get_anomaly_repo():
    """Print the titles of papers that share a single repository."""
    repo_info_list = get_repo_info(to_dict=False)
    ret_map = dict()
    paper_data = get_papers_from_db()
    for ri in repo_info_list:
        key = (ri.repo_owner, ri.repo_name)
        ret_map.setdefault(key, []).append(ri.paper_id)
    for key, paper_ids in ret_map.items():
        if len(paper_ids) > 1:
            for pid in paper_ids:
                # paper ids are assumed to be 1-based indices into paper_data
                print(paper_data[pid - 1].title)
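

# An equivalent grouping sketch using collections.defaultdict, which removes
# the need for setdefault entirely:
from collections import defaultdict

def group_papers_by_repo(repo_info_list):
    grouped = defaultdict(list)
    for ri in repo_info_list:
        grouped[(ri.repo_owner, ri.repo_name)].append(ri.paper_id)
    return grouped
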
def percentage_star_at_study_date():
    """
    We can first know that whether our analysis date
    is a good date.
    """
    study_year = 2017
    study_conf = 'CVPR'
    ri_obj_list = get_repo_info(to_dict=False, combine_star_events=True)
    repo_set = set()
    for ri in ri_obj_list:
        if (ri.repo_owner, ri.repo_name) in repo_set:
            continue
        if ri.conf != study_conf or ri.year != study_year:
            continue
        repo_set.add((ri.repo_owner, ri.repo_name))
        star_events = getattr(ri, 'star_events')
        for se in star_events:
            # The per-event percentage computation was left unfinished in this
            # snippet.
            pass
def get_data():
    """Label each paper popular/unpopular and dump its extracted text to ARFF."""
    data = dict()
    data['attributes'] = attrs
    data['description'] = ''
    data['relation'] = 'abs'
    repo_infos = get_repo_info(combine_star_events=True)
    data_list = list()
    for ri in repo_infos:
        d = list()
        if ri.year == 2018 and ri.conf == 'NIPS':
            continue
        conference = getattr(ri, 'conf')
        title = getattr(ri, 'title')
        if conference is None or conference == '':
            continue
        if title in conf.excluded_papers:
            continue
        if ri.language == '' or ri.language is None:
            continue
        if ri.stars_count >= threshold:
            d.append('popular')
        else:
            d.append('unpopular')

        # abs_path = os.path.join(conf.paper_abs_path(), 'abstracts')
        abs_path = os.path.join(conf.paper_pdf_path(), 'text')
        this_abs_path = os.path.join(abs_path, str(ri.paper_id) + '.txt')
        with open(this_abs_path, 'r', encoding='ascii', errors='ignore') as f:
            file_content = f.read()
        content = file_content.lower()
        content = content.replace('\r', ' ')
        content = content.replace('\n', ' ')
        content = content.replace('\"', ' ')
        content = content.replace('\'', ' ')
        d.append(content)
        data_list.append(d)

    data['data'] = data_list
    file_content = arff.dumps(data)
    arff_path = os.path.join(conf.root_path, 'paper_analysis.arff')
    with open(arff_path, 'w', encoding='utf-8') as f:
        f.write(file_content)
    return data_list
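
# Both md2arff() and get_data() repeat the same lowercase-and-replace cleanup.
# A shared helper would keep the two dumps consistent; clean_text is a
# hypothetical name sketched here, not taken from the project:
def clean_text(raw):
    cleaned = raw.lower()
    for ch in ('\r', '\n', '"', "'"):
        cleaned = cleaned.replace(ch, ' ')
    return cleaned
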
def definite_time_stars(days=90):
    """Report the fraction of each repo's stars earned within `days` of creation."""
    ri_obj_list = get_repo_info(to_dict=False)
    repo_set = set()
    statistics = 0
    for ri in ri_obj_list:
        if (ri.repo_owner, ri.repo_name) in repo_set:
            continue
        ri.combine_with_star_events()
        repo_set.add((ri.repo_owner, ri.repo_name))
        star_events = getattr(ri, 'star_events')
        good_stars = 0
        for se in star_events:
            if (se.timestamp - ri.created_at).days < days:
                good_stars += 1
        if len(star_events) > 0:
            print(good_stars / len(star_events), ri.conf, ri.created_at,
                  ri.stars_count)
            if good_stars / len(star_events) > 0.5:
                statistics += 1
    print(statistics)

# The constructor below belongs to a repository-crawler class whose enclosing
# class definition was not captured in this snippet; RepoCrawler is a
# hypothetical placeholder name.
class RepoCrawler:
    def __init__(self, paper_obj):
        assert (isinstance(paper_obj, Paper))
        self.paper_id = getattr(paper_obj, 'id')
        self.repo_name = paper_obj.repo_name
        self.repo_owner = paper_obj.repo_owner
        self.initial_url = 'https://api.github.com/repos/'
        self.page = 0
        self.per_page = page_size
        self.end_crawl = False

        if not self.check_repo_in_disk():
            self.end_crawl = True
            print('Repo %s not in disk!' % self.repo_name)

        if not self.end_crawl:
            print('Crawling data for %s\'s Repo %s' %
                  (self.repo_owner, self.repo_name))
            self.repo_info = get_repo_info(to_dict=True)[self.paper_id]
            # if self.check_all_data_in_disk():
            #     self.end_crawl = True
            #     print('Repo %s downloaded' % self.repo_name)
            self.page = self.get_start_page() - 1
def get_all_text():
    """Collect an LDA-ready token list for every qualifying paper."""
    text_data = []
    # abs_path = os.path.join(conf.paper_abs_path(), 'abstracts')
    abs_path = os.path.join(conf.paper_pdf_path(), 'text')
    repo_info = get_repo_info(combine_star_events=True)
    for ri in repo_info:
        title = getattr(ri, 'title')
        conference = getattr(ri, 'conf')
        if ri.year <= 2014:
            continue
        if ri.language == '' or ri.language is None:
            continue
        if title in conf.excluded_papers:
            continue
        if conference is None or conference == '':
            continue
        paper_id = getattr(ri, 'paper_id')
        f_path = os.path.join(abs_path, str(paper_id) + '.txt')
        with open(f_path, 'r', encoding='ascii', errors='ignore') as f_obj:
            file_content = f_obj.read()
        tokens = prepare_text_for_lda(file_content)
        text_data.append(tokens)
    return text_data
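
# prepare_text_for_lda() is project code that is not shown here. A typical
# LDA preprocessing pass looks roughly like the sketch below (lowercase,
# tokenize, drop short/non-alphabetic tokens and stopwords); this is an
# illustrative assumption, not the project's actual implementation:
def lda_tokens_sketch(text,
                      stopwords=frozenset({'the', 'a', 'an', 'and', 'of',
                                           'to', 'in', 'is', 'for', 'we'})):
    tokens = (t.strip('.,;:()[]') for t in text.lower().split())
    return [t for t in tokens
            if len(t) > 2 and t.isalpha() and t not in stopwords]
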
abs_attrs = AbstractTopicModelFeatures.attrs
code_attrs = CodeFeatures.attrs
# Deep-copied because the two removals below would otherwise mutate
# ReadmeFeatures.attrs itself.
readme_attrs = deepcopy(ReadmeFeatures.attrs)
repometa_attrs = RepoMetaFeatures.attrs

readme_attrs.remove('contain_docker')
readme_attrs.remove('contain_data')

all_attrs = abs_attrs + code_attrs + readme_attrs + repometa_attrs
all_attrs.append('stars')

if __name__ == '__main__':
    from obj.repo import get_repo_info

    repo_info = get_repo_info(combine_star_events=True)
    dict_list = list()
    # enumerate replaces the original manual counter, which was never incremented
    for i, ri in enumerate(repo_info):
        print(i)
        if ri.year == 2014:
            continue
        conference = getattr(ri, 'conf')
        title = getattr(ri, 'title')
        if conference is None or conference == '':
            continue
        if title in conf.excluded_papers:
            continue
        if ri.language == '' or ri.language is None:
            continue
        abs_feature_dict = AbstractTopicModelFeatures(ri).to_dict()
        # The remainder of the loop body was not captured in this snippet.

def in_year_month(date_obj, year, month):
    assert isinstance(date_obj, datetime)
    return date_obj.year == year and date_obj.month == month
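
# Quick usage check with illustrative values:
assert in_year_month(datetime(2017, 6, 15), 2017, 6)
assert not in_year_month(datetime(2017, 6, 15), 2018, 6)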


def extract_year_month(ri_obj):
    assert isinstance(ri_obj, Repo)
    return ri_obj.created_at.year, ri_obj.created_at.month


if __name__ == '__main__':
    ri_objs = get_repo_info(to_dict=False, combine_paper=True)
    ri_set = set()
    each_month_counter = dict()
    study_conf = 'ICML'
    for ri in ri_objs:
        if (ri.repo_owner, ri.repo_name) in ri_set:
            continue
        if ri.conf != study_conf:
            continue
        ri_set.add((ri.repo_owner, ri.repo_name))
        year, month = extract_year_month(ri)
        key = (year, month)
        each_month_counter[key] = each_month_counter.get(key, 0) + 1
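

# The same monthly histogram can be written with collections.Counter; a
# sketch reusing extract_year_month() and the de-duplication filter above:
from collections import Counter

def monthly_repo_counts(ri_objs, study_conf='ICML'):
    seen = set()
    counts = Counter()
    for ri in ri_objs:
        key = (ri.repo_owner, ri.repo_name)
        if key in seen or ri.conf != study_conf:
            continue
        seen.add(key)
        counts[extract_year_month(ri)] += 1
    return counts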