def handle(self, *args, **options):
    """Load one day of download counts from a file into DownloadCount rows.

    The path of the file to read is the first positional argument. Every
    row must hold exactly three ``separator``-delimited fields, in this
    order: counter, file id, source. Counts are aggregated per add-on and
    the rows already stored for ``date`` are replaced by the new ones.

    Options read: ``date`` (required) and ``separator``.

    Raises:
        CommandError: when the ``date`` option or the file argument is
            missing.
    """
    start = datetime.now()  # Measure the time it takes to run the script.
    day = options['date']
    if not day:
        raise CommandError('You must specify a --date parameter in the '
                           ' YYYY-MM-DD format.')
    sep = options['separator']
    if not args:
        # Fail with a readable message rather than a bare IndexError.
        raise CommandError('You must specify the file to load the '
                           'download counts from.')
    filename = args[0]

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}

    # Perf: preload all the files once and for all.
    # This builds a dict where each key (the file_id we get from the hive
    # query) has the addon_id as value.
    files_to_addon = dict(File.objects.values_list('id',
                                                   'version__addon_id'))

    # Stays at -1 when the file is empty, so the final report says 0 lines
    # instead of failing on an unbound name.
    index = -1
    with open(filename) as count_file:
        for index, line in enumerate(count_file):
            if index and (index % 10000) == 0:
                log.info('Processed %s lines' % index)

            # Only drop the line terminator: a last line without a trailing
            # newline must not lose its final character.
            splitted = line.rstrip('\n').split(sep)

            if len(splitted) != 3:
                log.debug('Badly formatted row: %s' % line)
                continue

            counter, file_id, src = splitted
            try:
                file_id, counter = int(file_id), int(counter)
            except ValueError:  # Badly formatted? Drop.
                continue

            # Does this file exist?
            if file_id not in files_to_addon:
                log.info('File with id: %s not found' % file_id)
                continue
            addon_id = files_to_addon[file_id]

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
            else:
                dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                download_counts[addon_id] = dc

            # We can now fill the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)

    # Make sure we don't have any existing counts for the same day, or it
    # would just increment again the same data. This is done only once the
    # file has been read successfully, so that a missing or unreadable file
    # doesn't wipe the counts already stored for that day.
    DownloadCount.objects.filter(date=day).delete()

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)
    total_time = (datetime.now() - start).total_seconds()
    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s seconds' % total_time)
def handle(self, *args, **options):
    """Import one day of hive download counts into DownloadCount rows.

    Reads ``download_counts.hive`` from
    ``<settings.TMP_PATH>/<folder>/<day>``, where ``folder`` is the first
    positional argument (default ``hive_results``) and ``day`` is the
    ``date`` option (default: yesterday, formatted as YYYY-MM-DD). Every
    row must hold exactly four ``separator``-delimited fields, in this
    order: day, counter, file id, source. Rows whose source is rejected by
    ``is_valid_source`` or whose file id is unknown are skipped. Counts are
    aggregated per add-on, the rows already stored for ``day`` are replaced
    and the input file is deleted at the end.

    Raises:
        CommandError: when ``get_date_from_file`` doesn't report ``day``
            for the input file.
    """
    start = datetime.now()  # Measure the time it takes to run the script.
    day = options['date']
    if not day:
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    folder = args[0] if args else 'hive_results'
    folder = path.join(settings.TMP_PATH, folder, day)
    sep = options['separator']
    filepath = path.join(folder, 'download_counts.hive')
    # Make sure we're not trying to update with mismatched data.
    if get_date_from_file(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)
    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    DownloadCount.objects.filter(date=day).delete()

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}

    # Perf: preload all the files once and for all.
    # This builds a dict where each key (the file_id we get from the hive
    # query) has the addon_id as value.
    files_to_addon = dict(File.objects.values_list('id',
                                                   'version__addon_id'))

    # Only accept valid sources, which are listed in the DownloadSource
    # model. The source must either be exactly one of the "full" valid
    # sources, or prefixed by one of the "prefix" valid sources.
    fulls = set(DownloadSource.objects.filter(type='full').values_list(
        'name', flat=True))
    prefixes = DownloadSource.objects.filter(type='prefix').values_list(
        'name', flat=True)

    # Stays at -1 when the file is empty, so the final report says 0 lines
    # instead of failing on an unbound name.
    index = -1
    with codecs.open(filepath, encoding='utf8') as count_file:
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)

            # Only drop the line terminator: a last line without a trailing
            # newline must not lose its final character.
            splitted = line.rstrip('\n').split(sep)

            if len(splitted) != 4:
                log.debug('Badly formatted row: %s' % line)
                continue

            # The day is unpacked under its own name so that the ``day``
            # computed above isn't silently rebound by each row. The date
            # stored on a new DownloadCount is the one found in the row.
            row_day, counter, file_id, src = splitted
            try:
                file_id, counter = int(file_id), int(counter)
            except ValueError:  # Badly formatted? Drop.
                continue

            if not is_valid_source(src, fulls=fulls, prefixes=prefixes):
                continue

            # Does this file exist?
            if file_id not in files_to_addon:
                continue
            addon_id = files_to_addon[file_id]

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
            else:
                dc = DownloadCount(date=row_day, addon_id=addon_id,
                                   count=0)
                download_counts[addon_id] = dc

            # We can now fill the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)
    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))

    # Clean up file.
    log.debug('Deleting {path}'.format(path=filepath))
    unlink(filepath)
def handle(self, *args, **options):
    """Import one day of hive download counts into DownloadCount rows.

    Reads ``download_counts.hive`` from
    ``<settings.NETAPP_STORAGE>/tmp/<folder>/<day>``, where ``folder`` is
    the first positional argument (default ``hive_results``) and ``day`` is
    the ``date`` option (default: yesterday, formatted as YYYY-MM-DD).
    Every row must hold exactly four ``separator``-delimited fields, in
    this order: day, counter, file id, source. Rows with a known-bad source
    or an unknown file id are skipped. Counts are aggregated per add-on,
    the rows already stored for ``day`` are replaced and the input file is
    deleted at the end.

    Raises:
        CommandError: when ``get_date_from_file`` doesn't report ``day``
            for the input file.
    """
    start = datetime.now()  # Measure the time it takes to run the script.
    day = options['date']
    if not day:
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    folder = args[0] if args else 'hive_results'
    folder = path.join(settings.NETAPP_STORAGE, 'tmp', folder, day)
    sep = options['separator']
    filepath = path.join(folder, 'download_counts.hive')
    # Make sure we're not trying to update with mismatched data.
    if get_date_from_file(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)
    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    DownloadCount.objects.filter(date=day).delete()

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}

    # Perf: preload all the files once and for all.
    # This builds a dict where each key (the file_id we get from the hive
    # query) has the addon_id as value.
    files_to_addon = dict(File.objects.values_list('id',
                                                   'version__addon_id'))

    # Incorrect sources: hive newline, ffsync and getpersona. The backslash
    # is escaped explicitly: a bare '\N' is a malformed escape sequence on
    # Python 3, while '\\N' is the same two characters everywhere.
    bad_sources = ('\\N', 'sync', 'gp')

    # Stays at -1 when the file is empty, so the final report says 0 lines
    # instead of failing on an unbound name.
    index = -1
    with open(filepath) as count_file:
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)

            # Only drop the line terminator: a last line without a trailing
            # newline must not lose its final character.
            splitted = line.rstrip('\n').split(sep)

            if len(splitted) != 4:
                log.debug('Badly formatted row: %s' % line)
                continue

            # The day is unpacked under its own name so that the ``day``
            # computed above isn't silently rebound by each row. The date
            # stored on a new DownloadCount is the one found in the row.
            row_day, counter, file_id, src = splitted
            try:
                file_id, counter = int(file_id), int(counter)
            except ValueError:  # Badly formatted? Drop.
                continue

            # Drop incorrect sources.
            if src in bad_sources:
                continue

            # Does this file exist?
            if file_id not in files_to_addon:
                continue
            addon_id = files_to_addon[file_id]

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
            else:
                dc = DownloadCount(date=row_day, addon_id=addon_id,
                                   count=0)
                download_counts[addon_id] = dc

            # We can now fill the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)
    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))

    # Clean up file.
    log.debug('Deleting {path}'.format(path=filepath))
    unlink(filepath)
def handle(self, *args, **options):
    """Import one day of hive download counts into DownloadCount rows.

    Reads ``download_counts.hive`` from
    ``<settings.NETAPP_STORAGE>/tmp/<folder>/<day>``, where ``folder`` is
    the first positional argument (default ``hive_results``) and ``day`` is
    the ``date`` option (default: yesterday, formatted as YYYY-MM-DD).
    Every row must hold exactly four ``separator``-delimited fields, in
    this order: day, counter, file id, source. Rows with a known-bad source
    or an unknown file id are skipped. Counts are aggregated per add-on and
    the rows already stored for ``day`` are replaced.

    Raises:
        CommandError: when ``get_date_from_file`` doesn't report ``day``
            for the input file.
    """
    start = datetime.now()  # Measure the time it takes to run the script.
    day = options['date']
    if not day:
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    folder = args[0] if args else 'hive_results'
    folder = path.join(settings.NETAPP_STORAGE, 'tmp', folder, day)
    sep = options['separator']
    filepath = path.join(folder, 'download_counts.hive')
    # Make sure we're not trying to update with mismatched data.
    if get_date_from_file(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)
    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    DownloadCount.objects.filter(date=day).delete()

    # Memoize the files to addon relations and the DownloadCounts.
    download_counts = {}

    # Perf: preload all the files once and for all.
    # This builds a dict where each key (the file_id we get from the hive
    # query) has the addon_id as value.
    files_to_addon = dict(
        File.objects.values_list('id', 'version__addon_id'))

    # Incorrect sources: hive newline, ffsync and getpersona. The backslash
    # is escaped explicitly: a bare '\N' is a malformed escape sequence on
    # Python 3, while '\\N' is the same two characters everywhere.
    bad_sources = ('\\N', 'sync', 'gp')

    # Stays at -1 when the file is empty, so the final report says 0 lines
    # instead of failing on an unbound name.
    index = -1
    with open(filepath) as count_file:
        for index, line in enumerate(count_file):
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)

            # Only drop the line terminator: a last line without a trailing
            # newline must not lose its final character.
            splitted = line.rstrip('\n').split(sep)

            if len(splitted) != 4:
                log.debug('Badly formatted row: %s' % line)
                continue

            # The day is unpacked under its own name so that the ``day``
            # computed above isn't silently rebound by each row. The date
            # stored on a new DownloadCount is the one found in the row.
            row_day, counter, file_id, src = splitted
            try:
                file_id, counter = int(file_id), int(counter)
            except ValueError:  # Badly formatted? Drop.
                continue

            # Drop incorrect sources.
            if src in bad_sources:
                continue

            # Does this file exist?
            if file_id not in files_to_addon:
                continue
            addon_id = files_to_addon[file_id]

            # Memoize the DownloadCount.
            if addon_id in download_counts:
                dc = download_counts[addon_id]
            else:
                dc = DownloadCount(date=row_day, addon_id=addon_id,
                                   count=0)
                download_counts[addon_id] = dc

            # We can now fill the DownloadCount object.
            dc.count += counter
            dc.sources = update_inc(dc.sources, src, counter)

    # Create in bulk: this is much faster.
    DownloadCount.objects.bulk_create(download_counts.values(), 100)
    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))