def main():
    parser = argparse.ArgumentParser(
        description='Deduplicate crawled repositories for a language')
    parser.add_argument(
        '-l',
        '--language',
        help=
        'The programming language to be collected (hint: replace spaces with +)',
        required=True)
    parser.add_argument('--start_date',
                        default=None,
                        help='The start date for filtering (if necessary)',
                        required=False)
    parser.add_argument('--end_date',
                        default=None,
                        help='The end date for filtering (if necessary)',
                        required=False)

    args = parser.parse_args()

    # Print processing start time
    start_time = datetime.now()
    print(f'Job started at {start_time}\n')

    main_path = os.path.join(utils.get_main_path(), 'data', 'crawler',
                             'repositories', args.language.lower())

    # Read all the files, concat repositories and deduplicate them
    repositories_df = read_all_files(main_path)
    deduplicated_repositories_df = deduplicate_repositories(repositories_df)

    # Print processed data
    print('\nShape of complete dataframe:', repositories_df.shape)
    print('Shape of deduplicated dataframe:',
          deduplicated_repositories_df.shape)

    # Save the deduplicated file
    save_deduplicated_file(main_path, deduplicated_repositories_df)

    # Filter the dataframe if necessary
    if args.start_date and args.end_date:
        # it is not necessary to filter by end_date because the date filter will be applied to commits
        deduplicated_repositories_df = deduplicated_repositories_df[
            deduplicated_repositories_df['updated_at'] >= args.start_date]
        print('\nShape of filtered dataframe:',
              deduplicated_repositories_df.shape)

        # Save the deduplicated file after filtering
        save_deduplicated_file(main_path,
                               deduplicated_repositories_df,
                               filtered=True)

    # Print processing finish time
    end_time = datetime.now()
    print(f'\nJob finished at {end_time}\n')

    print('>> Job finished in', end_time - start_time, '<<')
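
read_all_files, deduplicate_repositories and save_deduplicated_file are defined elsewhere in this module. A minimal sketch of what the deduplication step could look like with pandas, assuming the crawled frames carry 'id' and 'updated_at' columns (both column names are assumptions):

def deduplicate_repositories(repositories_df):
    # Sketch only, not the original implementation: keep the most recently
    # updated row per repository id ('id' and 'updated_at' are assumed names).
    return (repositories_df
            .sort_values('updated_at')
            .drop_duplicates(subset='id', keep='last')
            .reset_index(drop=True))
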
Example #2
def load_spider_settings():
    import glob
    import sys
    from utils import get_main_path
    path = get_main_path()
    if path not in sys.path:
        sys.path.append(path)
    for i in glob.glob('spiders/*'):
        if i.split('/')[1] in SPIDER_NAME:
            print('load settings from %s/settings.py' % i)
            module = i.replace('/', '.') + '.settings'
            __import__(module)
            spider_settings = sys.modules[module]
            for key, val in spider_settings.Settings.__dict__.items():
                if not key.startswith('__'):
                    globals()[key] = val
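
The loader above copies every public attribute of a per-spider Settings class into this module's globals. A hypothetical example of a spiders/<name>/settings.py file it would accept (all names and values are assumed):

# spiders/example_spider/settings.py (hypothetical contents)
class Settings:
    # Attributes that do not start with '__' are injected into the caller's
    # globals() by load_spider_settings().
    DOWNLOAD_DELAY = 1.0
    CONCURRENT_REQUESTS = 8
    USER_AGENT = 'example-bot/0.1'
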
Example #3
def main():
    parser = argparse.ArgumentParser(
        description='Divide the main file of repositories into smaller ones')
    parser.add_argument(
        '-l',
        '--language',
        help=
        'The programming language to be collected (hint: replace spaces with +)',
        required=True)
    parser.add_argument('--part-size',
                        default=10000,
                        help='Number of repositories in each new file',
                        required=False)
    parser.add_argument(
        '--ignore',
        default=True,
        help='Ignore files already collected according to the log metadata')

    args = parser.parse_args()

    # Print processing start time
    start_time = datetime.now()
    print(f'Job started at {start_time}\n')

    main_path = os.path.join(utils.get_main_path(), 'data', 'crawler')

    repositories_list = []

    # args.ignore may be the boolean default (True) or a string from the CLI
    if str(args.ignore).lower() == 'true':
        repositories_list = read_existing_log_metadata(main_path,
                                                       args.language)

    # Read the main file and divide it into partitions
    divide_repositories_file(main_path, args.language, int(args.part_size),
                             repositories_list)

    # Print processing finish time
    end_time = datetime.now()
    print(f'\nJob finished at {end_time}\n')

    print('>> Job finished in', end_time - start_time, '<<')
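
divide_repositories_file and read_existing_log_metadata are defined elsewhere. A rough sketch of how the partitioning could be done with pandas, assuming a deduplicated CSV per language and a 'full_name' column (the file names and column name are assumptions):

def divide_repositories_file(main_path, language, part_size, repositories_list):
    # Sketch only, not the original implementation: split the deduplicated
    # repository file into chunks of `part_size` rows, skipping repositories
    # already present in the log metadata.
    language_path = os.path.join(main_path, 'repositories', language.lower())
    df = pd.read_csv(os.path.join(language_path,
                                  f'{language.lower()}_repositories_deduplicated.csv'))
    df = df[~df['full_name'].isin(repositories_list)]
    for part, start in enumerate(range(0, len(df), part_size), start=1):
        part_file = os.path.join(language_path,
                                 f'{language.lower()}_repositories_part_{part}.csv')
        df.iloc[start:start + part_size].to_csv(part_file, index=False)
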
Example #4
def create_progress_file(in_progress, language, separator=','):
    file_name = os.path.join(utils.get_main_path(), 'data', 'crawler',
                             'repositories', language.lower(),
                             'crawling_repositories_metadata.csv')
    
    last_updated_date = None

    if not in_progress:
        # create the metadata file with the header
        with open(file_name, mode='w', newline='') as w:
            csv_file = csv.writer(w, delimiter=separator)
            csv_file.writerow(['log_date', 'language', 'stars', 'created_at', 'updated_at', 'page', 
                               'total_count', 'incomplete_results', 'complete_query'])
    else:
        # open the file and get the last date crawled
        # the logs for that date have to be excluded
        log_file_df = pd.read_csv(file_name, sep=separator)
        last_updated_date = max(log_file_df['updated_at'])

        log_file_df = log_file_df[log_file_df['updated_at'] != last_updated_date]
        log_file_df.to_csv(file_name, sep=separator, index=False)

    return file_name, last_updated_date
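
A short usage sketch for create_progress_file (the language value is assumed):

# Start a fresh crawl: writes the metadata CSV containing only the header row.
metadata_file, _ = create_progress_file(in_progress=False, language='Python')

# Resume a crawl: drops the log rows for the most recent 'updated_at' date so
# that date can be re-crawled, and returns it alongside the file path.
metadata_file, last_date = create_progress_file(in_progress=True, language='Python')
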
def main():
    parser = argparse.ArgumentParser(description='Stars collector from Github')
    parser.add_argument('-t',
                        '--token',
                        help='The GitHub token identifier used to crawl data',
                        required=True)
    parser.add_argument(
        '-l',
        '--language',
        help=
        'The programming language to be collected (hint: replace spaces by +)',
        required=True)
    parser.add_argument(
        '-d',
        '--date',
        default='2019-12-01',
        help='The start date for crawling (format: YYYY-MM-DD)',
        required=False)
    parser.add_argument(
        '--cont',
        default=False,
        help=
        'Pass true to continue a crawl already started for a specific language',
        required=False)
    parser.add_argument(
        '--reprocess',
        '-r',
        default=False,
        help='Reprocess queries that returned incomplete results the first time',
        required=False)

    args = parser.parse_args()

    # Print processing start time
    start_time = datetime.now()
    print(f'Crawling started at {start_time}\n')

    # Get token by key
    token = utils.get_token_key(args.token)
    print(f'Token successfully obtained using token key {args.token}\n')

    stars_file_path = os.path.join(utils.get_main_path(), 'data', 'crawler',
                                   'stars',
                                   f'{args.language}_stars_histogram.csv')

    # args.reprocess and args.cont may be boolean defaults or strings from the CLI
    if str(args.reprocess).lower() != 'true':
        # Recover the last number of stars or initialize a new file
        if str(args.cont).lower() == 'true':
            init_star = get_crawling_progress(stars_file_path)
        else:
            init_star = 0
            create_replace_stars_file(stars_file_path)

        # Get max stars for language
        max_stars = get_max_stars(token, args.language, args.date)

        # Save the histogram of repositories by stars
        save_stars_histogram(token, args.language, args.date, init_star,
                             max_stars, stars_file_path)
        print(f'\nStars file successfully saved to {stars_file_path}\n')
    else:
        # Reprocess the histogram of repositories by stars
        stars_reprocessed_file_path = stars_file_path.replace(
            '.csv', '_reprocessed.csv')
        reprocess_stars_histogram(token, args.language, args.date,
                                  stars_file_path)
        print(
            f'\nStars reprocessed file successfully saved to {stars_reprocessed_file_path}\n'
        )

    # Print processing finish time
    end_time = datetime.now()
    print(f'Crawling finished at {end_time}\n')

    print('>> Crawling finished in', end_time - start_time, '<<')
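
get_crawling_progress, create_replace_stars_file, get_max_stars, save_stars_histogram and reprocess_stars_histogram are defined elsewhere. A minimal sketch of get_crawling_progress, assuming the histogram CSV has a 'stars' column (the column name is an assumption):

def get_crawling_progress(stars_file_path):
    # Sketch only: resume from the highest star count already written to the
    # histogram file.
    stars_df = pd.read_csv(stars_file_path)
    return int(stars_df['stars'].max())
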
Example #6
def get_repositories_by_time(token, metadata_path, language, start_date, end_date=None):
    q_language = f'%3A\"{language}\"'
    q_per_page = '&per_page=100'

    # path to save crawling files
    crawler_path = os.path.join(utils.get_main_path(), 'data', 'crawler', 'repositories', language.lower(), 'daily_crawler')

    # original query based on language and stars
    base_query = f'https://api.github.com/search/repositories?q=stars%3A>0+created%3A>2010-01-01+language{q_language}'

    if not end_date:
        # if end date is not given, use the current date
        end_date = datetime.now().strftime('%Y-%m-%d')

    # get all the dates for crawling
    days = list(pd.date_range(start_date, end_date, freq='d'))
    str_days = [d.strftime('%Y-%m-%d') for d in days]

    for date in str_days:
        q_date = f'%3A{date}'

        date_query = base_query + f'+pushed{q_date}'

        r_date = requests.get(date_query, headers={'Authorization': 'token %s' % token}) 
        data = json.loads(r_date.content)
        total_count = data['total_count']

        print(f'Requesting repositories for {date} - {total_count} results')

        # verify the request time from API
        api.verify_request_time(token, 'search')

        if total_count <= 1000:
            page = 1
            while data['items'] and page <= 10:
                print(f'Requesting repositories for {date} - page {page}')

                q_page = f'&page={page}'
                complete_query = date_query + q_per_page + q_page

                file_crawler_path = os.path.join(crawler_path, f'{language.lower()}_{date}_2010_{page}.csv')

                data = save_result_query(token, complete_query, file_crawler_path)
                
                # log progress
                save_progress_metadata(metadata_path, language, 0, '2010-01-01', date, page,
                                       data['total_count'], data['incomplete_results'], complete_query)

                page = page + 1

        else:
            for year in range(2010, int(end_date[:4]) + 1):
                q_creation_date = f'%3A{year}-01-01..{year}-12-31'

                # new partitions by creation date
                new_date_query = date_query.replace('+created%3A>2010-01-01', f'+created{q_creation_date}')

                r_date = requests.get(new_date_query, headers={'Authorization': 'token %s' % token}) 
                data = json.loads(r_date.content)
                total_count = data['total_count']

                print(f'Requesting repositories for {date} and creation year {year} - {total_count} results')

                monthly_dividing = False

                if total_count > 1000:
                    monthly_dividing = True

                # verify the request time from API
                api.verify_request_time(token, 'search')

                if not monthly_dividing:
                    page = 1
                    while data['items'] and page <= 10:
                        print(f'Requesting repositories for {date} and creation year {year} - page {page}')

                        q_page = f'&page={page}'
                        new_complete_query = new_date_query + q_per_page + q_page
                        
                        new_file_crawler_path = os.path.join(crawler_path, f'{language.lower()}_{date}_{year}_{page}.csv')

                        data = save_result_query(token, new_complete_query, new_file_crawler_path)
                        
                        # log progress
                        save_progress_metadata(metadata_path, language, 0, f'{year}-01-01..{year}-12-31', date, page,
                                            data['total_count'], data['incomplete_results'], new_complete_query)

                        page = page + 1
                else:
                    month_groups = {1: ('01-01', '03-31'),
                                    2: ('04-01', '06-30'),
                                    3: ('07-01', '08-31'),
                                    4: ('09-01', '10-31'),
                                    5: ('11-01', '11-30'),
                                    6: ('12-01', '12-31')}
                    for month in month_groups:
                        q_creation_date = f'%3A{year}-{month_groups[month][0]}..{year}-{month_groups[month][1]}'

                        # new partitions by creation date
                        monthly_date_query = date_query.replace('+created%3A>2010-01-01', f'+created{q_creation_date}')

                        r_date = requests.get(monthly_date_query, headers={'Authorization': 'token %s' % token}) 
                        data = json.loads(r_date.content)
                        total_count = data['total_count']

                        print(f'Requesting repositories for {date} and creation year {year} - monthly division {month} - {total_count} results')

                        # verify the request time from API
                        api.verify_request_time(token, 'search')

                        page = 1
                        while data['items'] and page <= 10:
                            print(f'Requesting repositories for {date} and creation year {year} - monthly division {month} - page {page}')

                            q_page = f'&page={page}'
                            monthly_complete_query = monthly_date_query + q_per_page + q_page
                            
                            monthly_file_crawler_path = os.path.join(crawler_path, f'{language.lower()}_{date}_{year}_{month}_{page}.csv')

                            data = save_result_query(token, monthly_complete_query, monthly_file_crawler_path)
                            
                            # log progress
                            save_progress_metadata(metadata_path, language, 0, 
                                                   f'{year}-{month_groups[month][0]}..{year}-{month_groups[month][1]}', 
                                                   date, page, data['total_count'], data['incomplete_results'], monthly_complete_query)

                            page = page + 1
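
save_result_query and save_progress_metadata are defined elsewhere. A minimal sketch of save_result_query that is consistent with how it is used above, assuming pandas json_normalize is acceptable for flattening the search items (the CSV layout is an assumption):

def save_result_query(token, query, file_path):
    # Sketch only: run the search query, persist the returned items to CSV
    # and hand the parsed JSON back so the caller can paginate on 'items'.
    response = requests.get(query, headers={'Authorization': 'token %s' % token})
    data = json.loads(response.content)
    if data.get('items'):
        pd.json_normalize(data['items']).to_csv(file_path, index=False)
    return data
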