def load_broken_notifications(self):
    """Load broken notifications by type"""
    broken_notice_info = None
    for model_name, type_id_list in get_dv_object_to_object_id_map().items():
        # Get a list of object ids for this model type
        # that were not emailed--e.g. should show up
        # on the notifications pages
        #
        msgt('check: %s %s' % (model_name, type_id_list))

        # If there's a selected_model_name, then only process that model
        #
        if self.selected_model_name is None:
            pass    # check all models
        elif model_name != self.selected_model_name:
            # We have a selected_model_name and this isn't it!
            continue

        model_user_id_list = UserNotification.objects.select_related('user'
                        ).filter(
                            object_type__in=type_id_list,
                        ).values_list('objectid', 'user__id')

        if len(model_user_id_list) == 0:
            continue

        # retrieve the object ids only
        model_id_list = [x[0] for x in model_user_id_list]
        unique_id_list = list(set(model_id_list))

        # Next line is a hack - Need to upgrade Django apps
        # to not use this method
        model_class = eval(model_name)

        if model_name in ['DvObject', 'DatasetVersion', 'FileMetadata']:
            existing_ids = model_class.objects.filter(id__in=unique_id_list
                            ).values_list('id', flat=True
                            ).distinct()
        else:
            existing_ids = model_class.objects.select_related('dvobject'
                            ).filter(dvobject__id__in=unique_id_list
                            ).values_list('dvobject__id', flat=True
                            ).distinct()

        if len(unique_id_list) == len(existing_ids):
            # Looks good!
            # No notifications where object no longer exists
            continue

        # Create a list of the missing ids
        #
        missing_ids = list(set(unique_id_list) - set(existing_ids))

        # Record broken notification info
        #
        broken_notice_info = BrokenNotificationInfo(
                                model_name,
                                list(model_user_id_list),
                                missing_ids)

        self.broken_info_list.append(broken_notice_info)
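# A minimal sketch of the BrokenNotificationInfo container used above. The
# real class isn't shown in this file, so the attribute names below are
# assumptions based on how load_broken_notifications() calls it.
class BrokenNotificationInfo(object):
    """Hold broken-notification details for a single model type"""

    def __init__(self, model_name, object_user_id_pairs, missing_ids):
        self.model_name = model_name
        # (objectid, user id) pairs pulled from UserNotification
        self.object_user_id_pairs = object_user_id_pairs
        # object ids referenced by notifications but no longer in the db
        self.missing_ids = missing_ids

    def get_broken_count(self):
        """Count notifications whose object id is among the missing ids"""
        missing_set = set(self.missing_ids)
        return len([pair for pair in self.object_user_id_pairs
                    if pair[0] in missing_set])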
def download_file(url_to_file):
    """Download a Dataverse file and return the filepath and file extension"""
    file_handle, filepath = tempfile.mkstemp()

    msgt('download file: %s' % url_to_file)
    r = requests.get(url_to_file, stream=True)

    if r.status_code != 200:
        msg('bad status: %s' % r.status_code)
        if isfile(filepath):
            make_sure_file_deleted(filepath)
        return None, None

    file_ext = None
    content_dict = r.headers['content-disposition']
    #print 'content_dict', content_dict
    #fname = re.findall("filename=(.+)", content_dict)
    fname = format_file_name(content_dict)
    if fname:
        file_ext = fname.split('.')[-1].lower()

    print 'file_ext', file_ext

    with os.fdopen(file_handle, 'wb') as tmp:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                tmp.write(chunk)

    msg('File downloaded: %s' % filepath)

    return filepath, file_ext
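# format_file_name() is referenced above but not defined here. A minimal
# sketch, assuming it pulls the filename out of a Content-Disposition header
# such as 'attachment; filename="codebook.pdf"'.
import re

def format_file_name(content_disposition):
    """Return the filename from a Content-Disposition header, or None"""
    if not content_disposition:
        return None
    matches = re.findall(r'filename="?([^";]+)"?', content_disposition)
    if matches:
        return matches[0].strip()
    return None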
def show_diffs(self):
    """Debug: print out differences"""
    section = None
    for diff_obj in self.diff_list:
        if section != diff_obj.section:
            section = diff_obj.section
            msgt(section)
        diff_obj.show(show_section=False)
def show(self, show_section=True):
    """print info"""
    if show_section:
        msgt('%s: [%s] %s' % (self.section, self.attr_name, self.note))
    msg('attribute: %s' % self.attr_name)
    msg('\nnew: %s' % self.new_val)
    msg('\nold: %s' % self.old_val)
    dashes()
def show_elapsed_time(self, start_time):
    """From http://stackoverflow.com/questions/1345827/how-do-i-find-the-time-difference-between-two-datetime-objects-in-python"""
    time_now = int(time.time())  # epoch seconds

    days = divmod(time_now - start_time, 86400)  # days
    hours = divmod(days[1], 3600)  # hours
    minutes = divmod(hours[1], 60)  # minutes
    seconds = minutes[1]  # seconds

    msgt('Elapsed time: %d day(s), %d hour(s), %d minute(s), %d second(s)' %
         (days[0], hours[0], minutes[0], seconds))
def load_file_as_dict(fname):
    """Load a file as a python dict"""
    msgt('load file: %s' % fname)
    assert isfile(fname), '%s is not a file' % fname

    fcontent = open(fname, 'r').read()
    fcontent = fcontent.replace('\r\n', '\\r\\n')

    dict_info = json.loads(fcontent, object_pairs_hook=OrderedDict)

    if dict_info.has_key('data'):
        return dict_info['data']

    return dict_info
def write_files_to_mongo(self, **kwargs):
    """Write the saved dataset files to Mongo"""
    client = MongoClient()
    db = client.dataverse_database
    collection = db.datasets

    # look at kwargs
    #
    dataset_start_id = kwargs.get('dataset_start_id', 0)
    delete_all = kwargs.get('delete_all', False)

    # If appropriate, Delete existing records
    #
    if delete_all:
        msgt('Deleting current records')
        result = collection.delete_many({})
        msg('result.deleted_count: %s' % result.deleted_count)
        return

    fnames = os.listdir(self.output_dir)
    fnames = [x for x in fnames
              if x.endswith('.json') and x.startswith('ds_')]
    fnames.sort()

    start_time = int(time.time())  # epoch seconds
    cnt = 0
    for fname in fnames:
        cnt += 1
        ds_id = int(fname.split('.')[0].split('_')[1])
        msgt('(%d) process dataset %s (%s)' % (cnt, ds_id, fname))
        if ds_id < dataset_start_id:
            msg('skipping it')
            continue

        content = open(join(self.output_dir, fname), 'r').read()
        content = update_json_text(content)

        content_doc = json.loads(content, object_pairs_hook=OrderedDict)
        content_doc['_id'] = ds_id
        content_doc['dtype'] = 'dataset'

        #doc_id = collection.insert_one(content_doc).inserted_id
        #doc_id = collection.save(content_doc) #.inserted_id
        doc_id = collection.save(content_doc)

        if cnt % 500 == 0:
            self.show_elapsed_time(start_time)

    self.show_elapsed_time(start_time)
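# A hedged usage sketch for write_files_to_mongo(). 'DatasetMongoLoader' is a
# hypothetical name for the class that owns this method (the real class isn't
# shown here); the kwargs match those read inside the method.
#
# loader = DatasetMongoLoader()
# loader.write_files_to_mongo(delete_all=True)        # wipe the collection, then stop
# loader.write_files_to_mongo(dataset_start_id=5000)  # resume loading at dataset id 5000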
def test_search_mongo(self, term='law'):
    """Test searches"""
    client = MongoClient()
    db = client.dataverse_database
    collection = db.datasets

    # Compass:
    #
    # {"title": {$regex: "(^Law| Law | Law$)"}}
    """
    {"title":{"$regex":"(^Law| Law | Law$)","$options":"i"},"metadata_blocks.citation.dsDescription.dsDescriptionValue":
    {"$regex":"(^Law| Law | Law$)","$options":"i"}}
    """
    field_names = [
        'metadata_blocks.citation.dsDescription.dsDescriptionValue',
        #'title',
        #'metadata_blocks.citation.subject',
        #'metadata_blocks.citation.keyword.keywordValue',
    ]

    qlist = []
    for field_name in field_names:
        qlist.append({
            field_name: {
                '$regex': '(^{0}|\s{0}\s|\s{0}$)'.format(term),
                '$options': 'i'
            }
        })

    docs = collection.find({"$or": qlist})

    # -----------------------------
    #field_name = 'title'
    #field_name = 'metadata_blocks.citation.dsDescription.dsDescriptionValue'
    #docs = collection.find({field_name:{'$regex':'(^Law|\sLaw\s|\sLaw$)', '$options':'i'}})
    #docs = collection.find({'title':{'$regex':'(^Law|\sLaw\s|\sLaw$)', '$options':'i'}})

    from dict_map_util import DictMapUtil
    cnt = 0
    for doc in docs:
        cnt += 1
        msgt('(%d) %s' % (cnt, doc['title']))
        dmap_str = 'dmap.' + field_names[0]
        print 'dmap_str', dmap_str
        m = DictMapUtil(doc)
        #print eval(dmap_str)
        break
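# For reference, the query built above for term='law' with the single
# description field looks like this -- a whole-word, case-insensitive match
# at the start, middle, or end of the description text:
#
# {"$or": [
#     {"metadata_blocks.citation.dsDescription.dsDescriptionValue":
#         {"$regex": "(^law|\slaw\s|\slaw$)", "$options": "i"}}
# ]}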
def run_comparison(self):
    """Compare the two JSON datasets"""
    msgt(self.run_comparison.__doc__)

    # Run a quick check to see if the dicts are the same.
    #
    if cmp(self.old_ds, self.new_ds) == 0:
        msg('No differences!')
        return

    new_files_list = self.new_ds.pop('files', [])
    old_files_list = self.old_ds.pop('files', [])
    #print 'new_files_list', new_files_list

    self.compare_dicts('', self.new_ds, self.old_ds)

    self.compare_file_lists(new_files_list, old_files_list)
def download_file(url_to_file):
    """Download a Dataverse file and return the filename"""
    file_handle, filepath = tempfile.mkstemp()

    msgt('download file: %s' % url_to_file)
    r = requests.get(url_to_file, stream=True)

    if r.status_code != 200:
        msg('bad status: %s' % r.status_code)
        if isfile(filepath):
            make_sure_file_deleted(filepath)
        return None

    with os.fdopen(file_handle, 'wb') as tmp:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                tmp.write(chunk)

    msg('File downloaded: %s' % filepath)

    return filepath
def make_json_files(self):
    """Serialize the latest version of each dataset to a JSON file"""
    # Set publication status
    #
    filters = {}
    if self.published_only:
        filters.update(query_helper.get_is_published_filter_param())

    # Query for dataset ids
    #
    ds_id_query = Dataset.objects.filter(**filters
                    ).annotate(ds_id=F('dvobject__id')
                    ).values_list('ds_id', flat=True
                    ).order_by('ds_id')

    # Iterate through dataset ids
    #
    #start_time = datetime.now()
    start_time = int(time.time())  # epoch seconds
    cnt = 0
    no_versions_found_list = [45900]
    for ds_id in ds_id_query:
        cnt += 1
        msgt('(%d) Checking dataset id %s' % (cnt, ds_id))
        if ds_id < self.dataset_start_id:
            msg('skipping...(start at dataset id: %d)' % self.dataset_start_id)
            continue

        # Create file name
        #
        fname = 'ds_%s.json' % (str(ds_id).zfill(8))
        full_fname = join(OUTPUT_DIR, fname)

        # Should we overwrite the existing file?
        #
        if isfile(full_fname) and not self.overwrite_existing_files:
            msg('skipping...file already exists')
            continue

        dataset_version = get_latest_dataset_version(ds_id)
        if dataset_version is None:
            msg("Could not find dataset_version!")
            no_versions_found_list.append(ds_id)
            continue

        dataset_as_json = DatasetSerializer(dataset_version).as_json()

        open(full_fname, 'w').write(json.dumps(dataset_as_json, indent=4))
        msg('File written: %s' % full_fname)

        if cnt % 500 == 0:
            self.show_elapsed_time(start_time)

        #if cnt > 10:
        #    self.show_elapsed_time(start_time)
        #    break

    self.show_elapsed_time(start_time)
    print 'no_versions_found_list: %s' % no_versions_found_list
def get_count_broken_notifications():
    """Query each object type and make sure notifications aren't broken

    Example map:
        {
         'DvObject': [1],
         'Dataverse': [2],
         'Dataset': [14, 11],
         'DatasetVersion': [13, 12, 7],
         'DataFile': [9]
        }
    """
    broken_cnt = 0
    user_ids = []
    for model_name, type_id_list in get_dv_object_to_object_id_map().items():
        # Get a list of object ids for this model type
        # that were not emailed--e.g. should show up
        # on the notifications pages
        #
        msgt('check: %s %s' % (model_name, type_id_list))

        model_user_id_list = UserNotification.objects.select_related('user'
                        ).filter(
                            object_type__in=type_id_list,
                        ).values_list('objectid', 'user__id')

        model_id_list = [x[0] for x in model_user_id_list]
        user_ids += [x[1] for x in model_user_id_list]
        msg('model_id_list len: %s' % len(model_id_list))

        if len(model_id_list) == 0:
            continue

        # Used for later "bad notice" counts
        notice_counter = Counter(model_id_list)
        msg('notice_counter len: %s' % len(notice_counter))

        unique_id_list = list(set(model_id_list))
        msg('unique_id_list len: %s' % len(unique_id_list))

        # Need to upgrade apps files and not use this method
        model_class = eval(model_name)

        if model_name in ['DvObject', 'DatasetVersion', 'FileMetadata']:
            existing_ids = model_class.objects.filter(id__in=unique_id_list
                            ).values_list('id', flat=True
                            ).distinct()
        else:
            existing_ids = model_class.objects.select_related('dvobject'
                            ).filter(dvobject__id__in=unique_id_list
                            ).values_list('dvobject__id', flat=True
                            ).distinct()

        msg('existing_ids len: %s' % len(existing_ids))

        if len(unique_id_list) == len(existing_ids):
            # Looks good!
            continue

        missing_ids = list(set(unique_id_list) - set(existing_ids))
        for missing_id in missing_ids:
            broken_cnt += notice_counter.get(missing_id, 0)

    unique_user_ids = len(set(user_ids))

    return (broken_cnt, unique_user_ids)
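# A small usage sketch for get_count_broken_notifications(), e.g. from a
# management command or shell session:
#
# broken_count, affected_user_count = get_count_broken_notifications()
# msg('broken notifications: %s (across %s users)' % (broken_count, affected_user_count))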
def get_data_rows(self, as_json=False, pretty_print=False):
    """Return information as JSON

    {
      "data" : {
        "total_row_count" : 117,
        "preview_row_count" : 50,
        "column_names" : ["Name", "Position", "Office"],
        "rows" : [
            ["Tiger Nixon", "System Architect", "Edinburgh"],
            ["Garrett Winters", "Accountant", "Tokyo"]
        ]
      }
    }
    """
    if self.has_error():
        return None

    # Read the table
    try:
        if self.is_excel:
            msgt('Excel!')
            df = pd.read_excel(self.filepath)  #error_bad_lines=False)
        else:
            df = pd.read_table(self.filepath)
    except Exception as ex_obj:
        msg(ex_obj)
        msgt('Failed to open file via pandas!')
        temp_file_helper.make_sure_file_deleted(self.filepath)
        if self.is_excel:
            self.add_error('Failed to open Excel file via pandas. [%s]' % ex_obj)
        else:
            self.add_error(
                '<b>Probably not a tabular file!</b> Failed to open file via pandas. [%s]' % ex_obj)
        return None

    self.describe_as_html = df.describe().to_html()
    json_string = df.describe().to_json()
    self.describe_as_dict = json.loads(json_string,
                                       object_pairs_hook=OrderedDict)

    # Retrieve the columns
    self.column_names = df.columns.tolist()

    # Retrieve the rows
    self.data_rows = df[:self.num_preview_rows].values.tolist()
    #print 'rows', json.dumps(rows)

    # Format the response
    info_dict = OrderedDict()
    info_dict['total_row_count'] = len(df.index)
    info_dict['preview_row_count'] = len(self.data_rows)
    info_dict['column_names'] = self.column_names
    info_dict['rows'] = self.data_rows
    info_dict['describe_as_html'] = self.describe_as_html
    info_dict['describe_as_dict'] = self.describe_as_dict

    if as_json:
        if pretty_print:
            return json.dumps(info_dict, indent=4)
        return json.dumps(info_dict)

    return info_dict
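# A hedged usage sketch for get_data_rows(). 'TabularPreviewer' is a
# hypothetical name for the owning class (not shown here), which is assumed
# to set self.filepath, self.is_excel, and self.num_preview_rows.
#
# previewer = TabularPreviewer()  # hypothetical constructor
# info = previewer.get_data_rows(as_json=True, pretty_print=True)
# if info is None:
#     msg('preview failed; check previewer errors')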