def main():
    parser = argparse.ArgumentParser(
        description='Deduplicate crawled repositories for a language')
    parser.add_argument(
        '-l', '--language',
        help='The programming language to be collected (hint: replace spaces with +)',
        required=True)
    parser.add_argument('--start_date', default=None,
                        help='The start date for filtering (if necessary)',
                        required=False)
    parser.add_argument('--end_date', default=None,
                        help='The end date for filtering (if necessary)',
                        required=False)
    args = parser.parse_args()

    # Print the processing start time
    start_time = datetime.now()
    print(f'Job started at {start_time}\n')

    main_path = os.path.join(utils.get_main_path(), 'data', 'crawler',
                             'repositories', args.language.lower())

    # Read all the files, concatenate the repositories and deduplicate them
    repositories_df = read_all_files(main_path)
    deduplicated_repositories_df = deduplicate_repositories(repositories_df)

    # Print the processed data
    print('\nShape of complete dataframe:', repositories_df.shape)
    print('Shape of deduplicated dataframe:', deduplicated_repositories_df.shape)

    # Save the deduplicated file
    save_deduplicated_file(main_path, deduplicated_repositories_df)

    # Filter the dataframe if necessary
    if args.start_date and args.end_date:
        # Filtering on end_date is not needed here because that filter is applied on the commits
        deduplicated_repositories_df = deduplicated_repositories_df[
            deduplicated_repositories_df['updated_at'] >= args.start_date]
        print('\nShape of filtered dataframe:', deduplicated_repositories_df.shape)

        # Save the deduplicated file after filtering
        save_deduplicated_file(main_path, deduplicated_repositories_df, filtered=True)

    # Print the processing finish time
    end_time = datetime.now()
    print(f'\nJob finished at {end_time}\n')
    print('>> Job finished in', end_time - start_time, '<<')
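# read_all_files and deduplicate_repositories are helpers assumed to be defined elsewhere
# in this module. A minimal sketch of the deduplication step, assuming the crawled CSVs
# follow the GitHub search schema and that the 'id' column uniquely identifies a
# repository (both are assumptions, not confirmed by this file):
def deduplicate_repositories_sketch(repositories_df):
    # Keep the most recently updated entry for each repository id
    deduplicated_df = (repositories_df
                       .sort_values('updated_at')
                       .drop_duplicates(subset='id', keep='last')
                       .reset_index(drop=True))
    return deduplicated_df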
def load_spider_settings():
    import glob
    import sys
    from utils import get_main_path

    path = get_main_path()
    if path not in sys.path:
        sys.path.append(path)

    for i in glob.glob('spiders/*'):
        if i.split('/')[1] in SPIDER_NAME:
            print('Loading settings from %s/settings.py' % i)
            module = i.replace('/', '.') + '.settings'
            __import__(module)
            spider_settings = sys.modules[module]

            # Copy every public attribute of the Settings class into this module's globals
            for key, val in spider_settings.Settings.__dict__.items():
                if not key.startswith('__'):
                    globals()[key] = val
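# For context: load_spider_settings expects each package under spiders/ to provide a
# settings.py exposing a Settings class whose public class attributes become module-level
# constants here. A hypothetical spiders/<spider_name>/settings.py could look like this
# (attribute names and values are illustrative only, not taken from the real spiders):
#
#     class Settings:
#         DOWNLOAD_DELAY = 0.5
#         OUTPUT_PATH = 'data/crawler/spiders'
#         MAX_PAGES = 10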
def main():
    parser = argparse.ArgumentParser(
        description='Divide the main repositories file into smaller ones')
    parser.add_argument(
        '-l', '--language',
        help='The programming language to be collected (hint: replace spaces with +)',
        required=True)
    parser.add_argument('--part-size', default=10000,
                        help='Number of repositories in each new file',
                        required=False)
    parser.add_argument('--ignore', default='true',
                        help='Ignore files already collected according to the log metadata')
    args = parser.parse_args()

    # Print the processing start time
    start_time = datetime.now()
    print(f'Job started at {start_time}\n')

    main_path = os.path.join(utils.get_main_path(), 'data', 'crawler')

    repositories_list = []
    if args.ignore.lower() == 'true':
        repositories_list = read_existing_log_metadata(main_path, args.language)

    # Read the main file and divide it into partitions
    divide_repositories_file(main_path, args.language, int(args.part_size),
                             repositories_list)

    # Print the processing finish time
    end_time = datetime.now()
    print(f'\nJob finished at {end_time}\n')
    print('>> Job finished in', end_time - start_time, '<<')
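# divide_repositories_file is assumed to be defined elsewhere in this script. A rough
# sketch of the partitioning logic, assuming the deduplicated file is named
# '<language>_repositories.csv' and has a 'name' column (both hypothetical details):
def divide_repositories_file_sketch(main_path, language, part_size, repositories_list):
    import os
    import pandas as pd

    file_path = os.path.join(main_path, 'repositories', language.lower(),
                             f'{language.lower()}_repositories.csv')
    repositories_df = pd.read_csv(file_path)

    # Skip repositories already collected according to the log metadata
    if repositories_list:
        repositories_df = repositories_df[~repositories_df['name'].isin(repositories_list)]

    # Write one partition file for every part_size repositories
    for part, start in enumerate(range(0, len(repositories_df), part_size), start=1):
        part_df = repositories_df.iloc[start:start + part_size]
        part_path = file_path.replace('.csv', f'_part_{part}.csv')
        part_df.to_csv(part_path, index=False)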
def create_progress_file(in_progress, language, separator=','):
    file_name = os.path.join(utils.get_main_path(), 'data', 'crawler',
                             'repositories', language.lower(),
                             'crawling_repositories_metadata.csv')
    last_updated_date = None

    if not in_progress:
        # Create the metadata file with the header
        with open(file_name, mode='w', newline='') as w:
            csv_file = csv.writer(w, delimiter=separator)
            csv_file.writerow(['log_date', 'language', 'stars', 'created_at',
                               'updated_at', 'page', 'total_count',
                               'incomplete_results', 'complete_query'])
    else:
        # Open the file and get the last crawled date;
        # the logs for that date have to be excluded so it can be crawled again completely
        log_file_df = pd.read_csv(file_name, sep=separator)
        last_updated_date = max(log_file_df['updated_at'])
        log_file_df = log_file_df[log_file_df['updated_at'] != last_updated_date]
        log_file_df.to_csv(file_name, sep=separator, index=False)

    return file_name, last_updated_date
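# Usage sketch for create_progress_file: a fresh crawl creates the metadata file from
# scratch, while a resumed crawl drops the last (possibly partial) date so it gets
# crawled again. The language value below is illustrative:
#
#     metadata_path, _ = create_progress_file(False, 'Python')           # new crawl
#     metadata_path, last_date = create_progress_file(True, 'Python')    # resumed crawl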
def main():
    parser = argparse.ArgumentParser(description='Stars collector from GitHub')
    parser.add_argument('-t', '--token',
                        help='The GitHub token identifier used to crawl data',
                        required=True)
    parser.add_argument(
        '-l', '--language',
        help='The programming language to be collected (hint: replace spaces with +)',
        required=True)
    parser.add_argument(
        '-d', '--date', default='2019-12-01',
        help='The start date for crawling (format: YYYY-MM-DD)',
        required=False)
    parser.add_argument(
        '--cont', default='false',
        help='Pass true to continue a previously started crawl for this language',
        required=False)
    parser.add_argument(
        '--reprocess', '-r', default='false',
        help='Reprocess the queries that returned incomplete results in the first run',
        required=False)
    args = parser.parse_args()

    # Print the processing start time
    start_time = datetime.now()
    print(f'Crawling started at {start_time}\n')

    # Get the token by its key
    token = utils.get_token_key(args.token)
    print(f'Token successfully obtained using token key {args.token}\n')

    stars_file_path = os.path.join(utils.get_main_path(), 'data', 'crawler', 'stars',
                                   f'{args.language}_stars_histogram.csv')

    if args.reprocess.lower() != 'true':
        # Recover the last number of stars or initialize a new file
        if args.cont.lower() == 'true':
            init_star = get_crawling_progress(stars_file_path)
        else:
            init_star = 0
            create_replace_stars_file(stars_file_path)

        # Get the maximum number of stars for the language
        max_stars = get_max_stars(token, args.language, args.date)

        # Save the histogram of repositories by stars
        save_stars_histogram(token, args.language, args.date, init_star, max_stars,
                             stars_file_path)
        print(f'\nStars file successfully saved on {stars_file_path}\n')
    else:
        # Reprocess the histogram of repositories by stars
        stars_reprocessed_file_path = stars_file_path.replace('.csv', '_reprocessed.csv')
        reprocess_stars_histogram(token, args.language, args.date, stars_file_path)
        print(f'\nStars reprocessed file successfully saved on {stars_reprocessed_file_path}\n')

    # Print the processing finish time
    end_time = datetime.now()
    print(f'Crawling finished at {end_time}\n')
    print('>> Crawling finished in', end_time - start_time, '<<')
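# get_max_stars is defined elsewhere; a plausible sketch is to ask the GitHub search API
# for the single most-starred repository of the language created after the given date and
# read its stargazers_count (the exact query layout is an assumption, not the project's):
def get_max_stars_sketch(token, language, date):
    import json
    import requests

    query = (f'https://api.github.com/search/repositories?'
             f'q=language%3A"{language}"+created%3A>{date}'
             f'&sort=stars&order=desc&per_page=1')
    response = requests.get(query, headers={'Authorization': 'token %s' % token})
    data = json.loads(response.content)

    # The first item of a stars-descending search holds the maximum star count
    items = data.get('items', [])
    return items[0]['stargazers_count'] if items else 0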
def get_repositories_by_time(token, metadata_path, language, start_date, end_date=None):
    q_language = f'%3A"{language}"'
    q_per_page = '&per_page=100'

    # Path to save the crawling files
    crawler_path = os.path.join(utils.get_main_path(), 'data', 'crawler',
                                'repositories', language.lower(), 'daily_crawler')

    # Original query based on language and stars
    base_query = (f'https://api.github.com/search/repositories?'
                  f'q=stars%3A>0+created%3A>2010-01-01+language{q_language}')

    if not end_date:
        # If the end date is not given, use the current date
        end_date = datetime.now().strftime('%Y-%m-%d')

    # Get all the dates for crawling
    days = list(pd.date_range(start_date, end_date, freq='d'))
    str_days = [d.strftime('%Y-%m-%d') for d in days]

    for date in str_days:
        q_date = f'%3A{date}'
        date_query = base_query + f'+pushed{q_date}'
        r_date = requests.get(date_query,
                              headers={'Authorization': 'token %s' % token})
        data = json.loads(r_date.content)
        total_count = data['total_count']
        print(f'Requesting repositories for {date} - {total_count} results')

        # Verify the request time from the API
        api.verify_request_time(token, 'search')

        if total_count <= 1000:
            page = 1
            while data['items'] and page <= 10:
                print(f'Requesting repositories for {date} - page {page}')
                q_page = f'&page={page}'
                complete_query = date_query + q_per_page + q_page
                file_crawler_path = os.path.join(
                    crawler_path, f'{language.lower()}_{date}_2010_{page}.csv')
                data = save_result_query(token, complete_query, file_crawler_path)

                # Log the progress
                save_progress_metadata(metadata_path, language, 0, '2010-01-01',
                                       date, page, data['total_count'],
                                       data['incomplete_results'], complete_query)
                page = page + 1
        else:
            # The search API returns at most 1,000 results per query,
            # so partition the query by the repository creation year
            for year in range(2010, int(end_date[:4]) + 1):
                q_creation_date = f'%3A{year}-01-01..{year}-12-31'

                # New partitions by creation date
                new_date_query = date_query.replace('+created%3A>2010-01-01',
                                                    f'+created{q_creation_date}')
                r_date = requests.get(new_date_query,
                                      headers={'Authorization': 'token %s' % token})
                data = json.loads(r_date.content)
                total_count = data['total_count']
                print(f'Requesting repositories for {date} and creation year {year} - {total_count} results')

                monthly_dividing = total_count > 1000

                # Verify the request time from the API
                api.verify_request_time(token, 'search')

                if not monthly_dividing:
                    page = 1
                    while data['items'] and page <= 10:
                        print(f'Requesting repositories for {date} and creation year {year} - page {page}')
                        q_page = f'&page={page}'
                        new_complete_query = new_date_query + q_per_page + q_page
                        new_file_crawler_path = os.path.join(
                            crawler_path, f'{language.lower()}_{date}_{year}_{page}.csv')
                        data = save_result_query(token, new_complete_query,
                                                 new_file_crawler_path)

                        # Log the progress
                        save_progress_metadata(metadata_path, language, 0,
                                               f'{year}-01-01..{year}-12-31', date,
                                               page, data['total_count'],
                                               data['incomplete_results'],
                                               new_complete_query)
                        page = page + 1
                else:
                    # If a single creation year still exceeds 1,000 results,
                    # split the creation date range into smaller groups of months
                    month_groups = {1: ('01-01', '03-31'), 2: ('04-01', '06-30'),
                                    3: ('07-01', '08-31'), 4: ('09-01', '10-31'),
                                    5: ('11-01', '11-30'), 6: ('12-01', '12-31')}
                    for month in month_groups:
                        q_creation_date = (f'%3A{year}-{month_groups[month][0]}'
                                           f'..{year}-{month_groups[month][1]}')

                        # New partitions by creation date
                        monthly_date_query = date_query.replace(
                            '+created%3A>2010-01-01', f'+created{q_creation_date}')
                        r_date = requests.get(monthly_date_query,
                                              headers={'Authorization': 'token %s' % token})
                        data = json.loads(r_date.content)
                        total_count = data['total_count']
                        print(f'Requesting repositories for {date} and creation year {year} - monthly division {month} - {total_count} results')

                        # Verify the request time from the API
                        api.verify_request_time(token, 'search')

                        page = 1
                        while data['items'] and page <= 10:
                            print(f'Requesting repositories for {date} and creation year {year} - monthly division {month} - page {page}')
                            q_page = f'&page={page}'
                            monthly_complete_query = monthly_date_query + q_per_page + q_page
                            monthly_file_crawler_path = os.path.join(
                                crawler_path,
                                f'{language.lower()}_{date}_{year}_{month}_{page}.csv')
                            data = save_result_query(token, monthly_complete_query,
                                                     monthly_file_crawler_path)

                            # Log the progress
                            save_progress_metadata(metadata_path, language, 0,
                                                   f'{year}-{month_groups[month][0]}..{year}-{month_groups[month][1]}',
                                                   date, page, data['total_count'],
                                                   data['incomplete_results'],
                                                   monthly_complete_query)
                            page = page + 1
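# save_result_query and save_progress_metadata are assumed helpers defined elsewhere.
# A minimal sketch of save_result_query: run the paginated query, persist the returned
# items as CSV, and hand the parsed payload back to the caller (column handling is
# deliberately simplified here):
def save_result_query_sketch(token, complete_query, file_crawler_path):
    import json
    import requests
    import pandas as pd

    response = requests.get(complete_query,
                            headers={'Authorization': 'token %s' % token})
    data = json.loads(response.content)

    # Flatten the repository items and write them to the partition file
    items_df = pd.json_normalize(data.get('items', []))
    items_df.to_csv(file_crawler_path, index=False)

    return data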