def new_bib_fixture(self, bib_file): import csv from tech_services_reports.utility_code import convert_date from tech_services_reports.models import Bib, Item bibs = csv.reader(open(bib_file)) bibs.next() bib_count = 0 bib_fixture = [] for row in bibs: if len(row) > 1: bib_count += 1 number = row[0] cat_date = convert_date(row[1]) level = row[2] items = row[3:] if len(items) > 1000: print >> sys.stderr, number, len(items) continue bdict = {} bdict['pk'] = number bdict['model'] = 'tech_services_reports.bib' _fields = {} _fields['level'] = level if cat_date: _fields['cat_date'] = cat_date.strftime("%Y-%m-%d") else: cat_date = None _fields['items'] = items bdict['fields'] = _fields bib_fixture.append(bdict) print >> sys.stderr, "Total items: %d." % bib_count print simplejson.dumps(bib_fixture, indent=2)
def new_items(self, item_file):
    """Load items from a CSV export directly into the Item table.

    CSV layout: RECORD#(ITEM), CREATED(ITEM), LOCATION, 999 acq note,
    then attached bib numbers (unused here).
    """
    import csv
    from tech_services_reports.utility_code import convert_date, AcquisitionMethod
    from tech_services_reports.models import Item
    items = csv.reader(open(item_file))
    items.next()  # skip the CSV header row
    # Get location->format mappings from the web service.
    location_format_map = simplejson.load(
        urllib.urlopen(settings_app.LOCATION_FORMAT_URL))
    location_format_map = location_format_map['result']['items']
    item_count = 0
    for row in items:
        if len(row) > 1:
            item_count += 1
            number = row[0]
            create_date = convert_date(row[1])
            location = row[2].strip()
            acq_note = row[3]
            item, obj_created = Item.objects.get_or_create(number=number)
            item.created = create_date
            print >> sys.stderr, item, obj_created
            # Normalize to lower case.
            item.location = location.lower()
            # Fix: an unknown location code used to raise KeyError and abort
            # the whole load; guard it like the other loaders do.
            try:
                item.format = location_format_map[location]['format']
            except KeyError:
                print >> sys.stderr, "%s is an unknown location code." % location
                item.format = None
            item.acquisition_method = AcquisitionMethod(acq_note).note
            item.save()
def new_marc(self, marc_file):
    """Build a combined bib/item/cat-edit Django fixture from a MARC file.

    Skips bib/item pairs outside the reporting range: no cat date and no
    valid items, or a pre-2010-10 cat date and no items.  Returns the
    accumulated fixture list (previously it was built and then discarded).
    """
    from tech_services_reports.utility_code import convert_date, CatStat
    from tech_services_reports.models import Bib, Item
    from datetime import date
    from pymarc import MARCReader
    fixture = []
    cedit_count = 0
    for record in MARCReader(open(marc_file)):  # open() works on py2 and py3
        bib_number = record['907']['a'][1:]
        bib_level = record['998']['c']
        cat_date = convert_date(record['998']['b'])
        cat_stat = CatStat(record)
        no_items = False
        # Create Bib fixture.
        bdict = {}
        bdict['pk'] = bib_number
        bdict['model'] = 'tech_services_reports.bib'
        _fields = {}
        _fields['level'] = bib_level
        _fields['cat_type'] = cat_stat.cattype
        _fields['mat_type'] = cat_stat.mattype
        # Omit cat_date from the fixture when the record had none.
        # (Fix: removed dead `cat_date = None` else-branch.)
        if cat_date:
            _fields['cat_date'] = cat_date.strftime("%Y-%m-%d")
        items = record.get_fields('945')
        # Prep item fixture and append to main fixture; item_fixture()
        # drops items created outside the reporting range.
        _i = self.item_fixture(items)
        item_fixture = _i['fixture']
        valid_items = _i['valid_items']
        if len(valid_items) == 0:
            no_items = True
        # Skip pairs without a cat date and no items; pairs with items
        # but no cat date are still added to the fixture.
        if not cat_date:
            if no_items:
                continue
        # Skip pairs with an old cat date and no items.
        if cat_date:
            if cat_date < date(2010, 10, 1):  # fix: `01` legacy literal
                if no_items:
                    continue
        # item_fixture is a list, so extend the main list.
        fixture += item_fixture
        _fields['items'] = valid_items
        bdict['fields'] = _fields
        # This is just a dict, so append.
        fixture.append(bdict)
        # Get cat edits.
        marc_995 = record.get_fields('995')
        cedit = self.cat_edit_fixture(marc_995, bib_number)
        fixture += cedit['fixture']
        cedit_count += len(cedit['fixture'])
    # Fix: the accumulated fixture was never returned; hand it to the caller.
    return fixture
def item_fixture(self, marc_items):
    """Build Django fixture dicts for MARC 945 item fields.

    Skips items with no item number, no create date, or a create date
    before the reporting start.  Returns
    {'fixture': [fixture dicts], 'valid_items': [item numbers]}.
    """
    # Fix: `date` was used below but never imported in this method.
    from datetime import date
    from tech_services_reports.utility_code import convert_date, AcquisitionMethod
    item_fixture = []
    item_count = 0
    valid_items = []
    for item in marc_items:
        try:
            item_number = item['y'].strip()[1:]
        except AttributeError:  # no item number
            continue
        item_created = convert_date(item['z'])
        try:
            item_location = item['l'].strip()
        except AttributeError:
            item_location = 'unknown'
        item_acc_note = item[settings_app.ITEM_ACC_NOTE]
        # Skip items without create dates.
        if not item_created:
            continue
        # Skip items not created after the reporting start date.
        if item_created < date(settings_app.BEGIN_YEAR, settings_app.BEGIN_MONTH, 1):
            continue
        try:
            acquisition_method = AcquisitionMethod(item_acc_note).note
        except NameError:  # can't map the acq note
            acquisition_method = None
        except AttributeError:  # no acq note at all
            acquisition_method = None
        # NOTE(review): location_format_map is not defined in this method;
        # presumably a module-level mapping.  A missing name would raise
        # NameError, which the KeyError handler below won't catch -- confirm.
        try:
            item_format = location_format_map[item_location]['format']
        except KeyError:  # unknown location code
            item_format = None
        idict = {}
        idict['pk'] = item_number
        idict['model'] = 'tech_services_reports.item'
        _fields = {}
        _fields['acquisition_method'] = acquisition_method
        _fields['format'] = item_format
        _fields['location'] = item_location
        _fields['created'] = item_created.strftime("%Y-%m-%d")
        idict['fields'] = _fields
        item_fixture.append(idict)
        valid_items.append(item_number)
        item_count += 1
    return {'fixture': item_fixture, 'valid_items': valid_items}
def get_first_item( items ):
    """Return the create date of the earliest attached item.

    E.g. Items will be counted as an accessioned title on the date of the
    first attached item.  The first item's date seeds the minimum; later
    items can only lower it.  Items whose create date can't be parsed are
    ignored, and the 1900-01-01 sentinel is returned if the seed was never
    replaced."""
    earliest = datetime.date(1900, 1, 1)
    for idx, attached in enumerate(items):
        created = utility_code.convert_date(attached['z'])
        if not created:
            continue
        if idx == 0 or created < earliest:
            earliest = created
    return earliest
def first_item(self, items):
    """Return the create date of the earliest attached item.

    E.g. Items will be counted as an accessioned title on the date of
    the first attached item.  Returns the 1900-01-01 sentinel when no
    item has a usable create date.
    """
    from tech_services_reports.utility_code import convert_date
    from datetime import date
    first = date(1900, 1, 1)
    for count, item in enumerate(items):
        item_created = convert_date(item['z'])
        # Fix: skip items whose create date can't be parsed; previously a
        # None here could seed `first` or poison the comparison below
        # (matches the sibling get_first_item()).
        if not item_created:
            continue
        # Initialize first attached item date.
        if count == 0:
            first = item_created
        if item_created < first:
            first = item_created
    return first
def new_item_fixture(self, item_file): import csv import urllib from tech_services_reports.utility_code import convert_date, AcquisitionMethod from tech_services_reports.models import Item items = csv.reader(open(item_file)) items.next() #Get mappings from service location_format_map = simplejson.load( urllib.urlopen(settings_app.LOCATION_FORMAT_URL)) location_format_map = location_format_map['result']['items'] #RECORD #(ITEM) CREATED(ITEM) LOCATION 999 RECORD #(BIBLIO) item_count = 0 item_fixture = [] for row in items: if len(row) > 1: item_count += 1 number = row[0] create_date = convert_date(row[1]) location = row[2].strip() acq_note = row[3] try: acquisition_method = AcquisitionMethod(acq_note).note except NameError: acquisition_method = None try: format = location_format_map[location]['format'] except KeyError: print >> sys.stderr, "%s is an unkown location code." % location format = None bibs = row[4:] idict = {} idict['pk'] = number idict['model'] = 'tech_services_reports.item' _fields = {} _fields['acquisition_method'] = acquisition_method _fields['format'] = format _fields['location'] = location _fields['created'] = create_date.strftime("%Y-%m-%d") idict['fields'] = _fields item_fixture.append(idict) print >> sys.stderr, "Total items: %d." % item_count print simplejson.dumps(item_fixture, indent=2)
def new_bibs(self, bib_file):
    """Load bibs (and links to their attached items) from a CSV export
    into the database via get_or_create."""
    import csv
    from tech_services_reports.utility_code import convert_date
    from tech_services_reports.models import Bib, Item
    reader = csv.reader(open(bib_file))
    reader.next()  # header row
    bib_count = 0
    for row in reader:
        if len(row) <= 1:
            continue
        bib_count += 1
        bib_number = row[0]
        cat_date = convert_date(row[1])
        bib_level = row[2]
        attached = row[3:]
        bib, was_created = Bib.objects.get_or_create(number=bib_number)
        bib.level = bib_level
        bib.cat_date = cat_date
        print >> sys.stderr, bib, was_created, cat_date
        for item_number in attached:
            item_obj, _ = Item.objects.get_or_create(number=item_number)
            bib.items.add(item_obj)
        bib.save()
def get_cat_date( self, record ):
    """ Sets and returns `self.cat_date` from the MARC 998 $b string,
        converted via utility_code.convert_date().
        Called by parse_record() """
    raw = record['998']['b']
    self.cat_date = utility_code.convert_date( raw.strip() )
    return self.cat_date
def count_volumes( marc_items, cat_date, material_type, counted_items, location_format_map ):
    """Create summary accession info for items created within given range.

    Returns {'volumes': {Acc-key: count}, 'titles': {Acc-key: count}}.
    The titles dict gets at most one entry per bib, keyed by the first
    counted item created on/before the bib's first-item date.
    """
    log.debug( 'starting count_volumes()' )
    from tech_services_reports.utility_code import convert_date, AcquisitionMethod
    from tech_services_reports.helpers import defaultdict as DD
    from tech_services_reports.helpers import namedtuple
    from datetime import date
    summary = DD(int)
    summary_titles = DD(int)
    #Marker to hold whether this title has been counted as an accession.
    title_counted = False
    #Named tuple used as key for storing totals.
    #Method names need to match models.
    #(Removed deprecated `verbose=False` arg -- a no-op, gone in py3.9+.)
    Acc = namedtuple('acc_summary', ['number',
        'created',
        'acquisition_method',
        'format',
        'location',
        'serial_added_volume'])
    #For determining if a title is accessioned: serials use the cat date,
    #everything else uses the first attached item's create date.
    if material_type != 's':
        first_item = get_first_item( marc_items )
    else:
        first_item = cat_date
    for item in marc_items:
        try:
            item_number = item['y']
            if not item_number:
                continue
            item_number = item_number.lstrip('.')
        except KeyError:
            continue
        #Get acc note, skip anything without one.
        item_acc_note = item[settings_app.ITEM_ACC_NOTE]
        if not item_acc_note:
            continue
        item_created = convert_date(item['z'])
        #Yes, some item records don't have a created date.
        if not item_created:
            continue
        #Skip items from before system was implemented.
        if item_created.year < settings_app.BEGIN_YEAR:
            continue
        if item_created.year == settings_app.BEGIN_YEAR:
            if item_created.month < settings_app.BEGIN_MONTH:
                continue
        #Skip known items.
        if item_number in counted_items:
            continue
        try:
            if not item['l']:
                log.warning( 'no location code; item_number, `{}`'.format(item_number) )
                continue
            raw_location = item['l'].strip()
            #Store raw location codes in case building names change in the future.
            #This will make display tricky.
            item_location = raw_location
        except KeyError:
            #Fix: also reset raw_location; previously it stayed unbound (or
            #stale from a prior loop pass), so the format lookup below could
            #raise NameError or use the wrong item's location.
            raw_location = 'unknown'
            item_location = 'unknown'
        try:
            acquisition_method = AcquisitionMethod(item_acc_note).note
        except NameError as e:
            log.warning( 'error instantiating AcquisitionMethod(); error logged' )
            log.info( 'error instantiating AcquisitionMethod();\nitem, ```{itm}```;\ninfo, ```{err}```'.format( itm=item, err=repr(e) ) )
            continue
        try:
            item_format = location_format_map[raw_location]['format']
        except KeyError:
            item_format = 'unknown'
        #Serial added volume: item created after the bib's cat date on a
        #serial-level bib.
        serial_added_volume = False
        if cat_date:
            if item_created > cat_date:
                if material_type == 's':
                    serial_added_volume = True
        _key = Acc(number=item_number,
            created=item_created,
            acquisition_method=acquisition_method,
            format=item_format,
            location=item_location,
            serial_added_volume=serial_added_volume)
        summary[_key] += 1
        #Count the title once, on the earliest attached item.
        if not title_counted:
            try:
                if item_created <= first_item:
                    summary_titles[_key] += 1
                    title_counted = True
            except TypeError:
                #first_item can be None (serial bib with no cat date).
                pass
    return_val = {'volumes': dict(summary), 'titles': dict(summary_titles)}
    if return_val['volumes'] or return_val['titles']:
        log.debug( 'return_val, ```{}```'.format( pprint.pformat(return_val) ) )
    return return_val
def get_bib_created( self, record ):
    """ Sets and returns `self.bib_created` from the MARC 907 $c string,
        converted via utility_code.convert_date().
        Called by parse_record() """
    raw = record['907']['c']
    self.bib_created = utility_code.convert_date( raw.strip() )
    return self.bib_created
def get_bib_created( this_record ):
    """ Returns the bib-record create date (MARC 907 $c) as whatever
        utility_code.convert_date() yields (a date-object, or None on a
        bad string).

        Fix: strips surrounding whitespace first, for consistency with
        the method versions of get_bib_created() and get_cat_date(). """
    datestr = this_record['907']['c'].strip()
    return utility_code.convert_date( datestr )
def process_marc_file( marc_file, existing_items, location_format_map ):
    """ Parses a MARC file record-by-record, tallying cataloging edits,
        accessioned titles, and accessioned volumes.
        Tracks byte offsets so that when a record fails to parse, the bad
        segment can be re-read and logged, and reading resumes afterwards.
        Returns ( cataloging_edit_count, title_count, volume_count ). """
    counter = 0  # NOTE(review): appears unused; kept as-is
    cataloging_edit_count = {}
    title_count = {}
    volume_count = {}
    with open( marc_file, 'rb' ) as fh:
        start = datetime.datetime.now()
        # measure file size by seeking to the end, then rewind
        fh.seek( 0, 2 ); file_size = fh.tell(); fh.seek( 0 )
        log.debug( 'file_size(K), `{}`'.format( file_size/1024 ) )
        count_processed = 0; count_good = 0; count_bad = 0
        # last/current byte positions bracket the most recent record, so a
        # parse failure can re-read and log the offending bytes
        last_position = 0; current_position = 0
        segment_to_review = 'init'
        # reader = pymarc.MARCReader( fh, to_unicode=True, force_utf8=True, utf8_handling='ignore' )
        # reader = pymarc.MARCReader( fh, force_utf8=True, utf8_handling='ignore' )
        # reader = pymarc.MARCReader( fh, utf8_handling='ignore' )
        reader = pymarc.MARCReader( fh )
        process_flag = True
        while process_flag is True:
            try:
                record = next( reader )
                count_good += 1
                current_position = fh.tell()
                last_position = current_position
                try:
                    bib_number = record['907']['a'][1:]  # drop the leading '.'
                    log.debug( 'bib_number, `{}`'.format(bib_number) )
                except TypeError:
                    # record lacks a 907 $a; skip it
                    log.debug( 'no bib_number' )
                    continue
                bib_level = record['998']['c']
                bib_created = get_bib_created( record )
                #==================================================================
                # Count cat edits
                #==================================================================
                cat_date = utility_code.convert_date(record['998']['b'])
                cat_stat = CatStat(record)
                #Count cataloging edits
                #Store needed fields.
                marc_995 = record.get_fields('995')
                mat_type = cat_stat.mat_type()
                source = cat_stat.cat_type()
                #Batch edit notes stored here.
                marc_910 = record.get_fields('910')
                #Count the batch load info
                this_batch_edit = count_batch_edits( bib_number, bib_created, mat_type, marc_910, cataloging_edit_count, source )
                cataloging_edit_count.update(this_batch_edit)
                #Count individual edits added by staff.
                this_cat_edit = count_cataloging_edits(bib_number, mat_type, marc_995, cataloging_edit_count, source)
                cataloging_edit_count.update(this_cat_edit)
                #==================================================================
                # Count accessions based off item fields.
                #==================================================================
                items = record.get_fields('945')
                #Count the volumes
                #This will be dict with a named tuple as a key.
                this_count = count_volumes(items, cat_date, mat_type, existing_items, location_format_map)
                #We won't be counting everything - skipping some old items.
                if this_count is None:
                    continue
                #Pull the volume and title count from the accessions key.
                this_vol = this_count['volumes']
                this_title = this_count['titles']
                #Add the title count
                for k, title in this_title.items():
                    title_count[k] = title_count.get(k, 0) + title
                #Add the volume count
                #Iterate through item counts and update
                for k, vol in this_vol.items():
                    volume_count[k] = volume_count.get(k, 0) + vol
            except Exception as e:
                ## info level to handle console output
                log.info( 'exception accessing record, ```{count}```; tell-count, ```{tell}```'.format(count=count_processed, tell=fh.tell() ) )
                log.info( 'exception in file, ```{fl}```\n; info-a, ```{err_a}```\ninfo-b, ```{err_b}```'.format( fl=marc_file, err_a=e, err_b=repr(e) ) )
                count_bad += 1
                # re-read the bytes between the last good record and the
                # current position so the problem segment lands in the log
                current_position = fh.tell()
                segment_to_review_byte_count = current_position - last_position
                fh.seek( last_position )
                segment_to_review = fh.read( segment_to_review_byte_count )
                log.info( 'segment_to_review, ```{}```'.format(segment_to_review) )
                ## TODO: write these to a separate file
                fh.seek( current_position )
                last_position = current_position
            # stop once the reader has consumed the whole file
            if fh.tell() == file_size:
                process_flag = False
            count_processed += 1
            if count_processed % 10000 == 0:
                log.info( '`{}` records processed'.format(count_processed) )
            # if count_processed > 10000:
            #     break
        end = datetime.datetime.now()
        ## warning level really just for console output
        log.warning( 'summary for marc file, ```{}```'.format(marc_file) )
        log.warning( 'count_processed, `{}`'.format(count_processed) )
        log.warning( 'count_good_encoding, `{}`'.format(count_good) )
        if count_bad > 0:
            bad_msg = 'count_bad_encoding, `{}`; problem-segments are in log'.format( count_bad )
        else:
            bad_msg = 'count_bad_encoding, `{}`'.format( count_bad )
        log.warning( bad_msg )
        log.warning( 'time_taken, `{}`'.format(end-start) )
        log.info( 'cataloging_edit_count, ```{}```'.format( pprint.pformat(cataloging_edit_count) ) )
        log.info( 'title_count, ```{}```'.format( pprint.pformat(title_count) ) )
        log.info( 'volume_count dct, ```{}```'.format( pprint.pformat(volume_count) ) )
    return_tpl = ( cataloging_edit_count, title_count, volume_count )
    return return_tpl
def summary(self, marc_file):
    """Harvests data points from exported MARC fields.
    Date counts will include the date of a given harvest.

    Tallies per-record cataloging edits, accessioned titles, and
    accessioned volumes, then writes the aggregates to the Accession
    and CatEdit tables."""
    from tech_services_reports.utility_code import convert_date, CatStat
    from tech_services_reports.models import Accession, Cataloging, CatEdit, Harvest
    from datetime import date
    from pymarc import MARCReader
    #Dicts to store counts
    cataloging_edit_count = {}
    cataloging_count = {}  # NOTE(review): never written -- see commented-out block below
    title_count = {}
    volume_count = {}
    #Find items already counted.
    #Add logic to skip counted items.
    existing_items = self.counted_items()
    #Find last harvest.
    #Last harvest is only used as a date for scanning MARC records.
    #This should speed up loading but data is stored and aggregated
    #in the database by item create date and cat date without reference
    #to harvested date.
    last_harvest = self.last_harvest
    last_harvest_date = last_harvest.date
    this_harvest, created = Harvest.objects.get_or_create(
        date=date.today())
    this_harvest_date = this_harvest.date
    #Loop through marc records.
    for record in MARCReader(file(marc_file)):
        try:
            bib_number = record['907']['a'][1:]
        except TypeError:
            print >> sys.stderr, "No bib number"
            print >> sys.stderr, record
            continue
        bib_level = record['998']['c']
        #NOTE(review): bib_created reads 998 $b -- the same subfield as
        #cat_date below -- while get_bib_created() elsewhere reads 907 $c.
        #Looks like a copy/paste slip; TODO confirm intended subfield.
        bib_created = convert_date(record['998']['b'])
        cat_date = convert_date(record['998']['b'])
        cat_stat = CatStat(record)
        # if cat_date:
        #No longer using cataloging table. All cataloging info
        #will be stored in CatEdits.
        #Count cataloging
        # if cat_date > last_harvest_date:
        #     if cat_date <= this_harvest_date:
        #         this_cat = self.count_cataloging(cat_date,
        #                                          bib_level,
        #                                          cat_stat,
        #                                          cataloging_count)
        #         cataloging_count.update(this_cat)
        # #Count cataloging edits
        marc_995 = record.get_fields('995')
        mat_type = cat_stat.mat_type()
        source = cat_stat.cat_type()
        #Batch edit notes stored here.
        marc_910 = record.get_fields('910')
        #Count the batch load info
        this_batch_edit = self.count_batch_edits(bib_number, bib_created, mat_type, marc_910, cataloging_edit_count, source)
        cataloging_edit_count.update(this_batch_edit)
        #Count individual edits added by staff.
        this_cat_edit = self.count_cataloging_edits(
            bib_number, mat_type, marc_995, cataloging_edit_count, source)
        cataloging_edit_count.update(this_cat_edit)
        items = record.get_fields('945')
        #Count the volumes
        #This will be dict with a named tuple as a key.
        this_vol = self.count_volumes(items, this_harvest, existing_items)
        #Skip bibs without attached items.
        if this_vol == {}:
            continue
        #Use the details from the first attached volume for the title count.
        #Determine if this bib should count as a title.
        first_item = self.first_item(items)
        if first_item > last_harvest_date:
            if first_item <= this_harvest_date:
                k = this_vol.keys()[0]
                title_count[k] = title_count.get(k, 0) + 1
        #Iterate through item counts and update
        for k, vol in this_vol.items():
            volume_count[k] = volume_count.get(k, 0) + vol
    #Finish looping through MARC records
    #Write accession summary to DB
    #This might have to become a fixture.
    print >> sys.stderr, "Writing accessions stats to DB."
    for meta, count in volume_count.items():
        obj, created = Accession.objects.get_or_create(
            number=meta.number,
            created=meta.created,
            acquisition_method=meta.acquisition_method,
            location=meta.location,
            format=meta.format,
            defaults={
                'volumes': 0,
                'titles': 0
            },
        )
        #Add volume count. Need to add to what is there. One harvest date
        #will have multiple files, each of which might have accessions or
        #cataloging on the same day.
        obj.volumes = obj.volumes + count
        #NOTE(review): defaulting the title count to 1 (rather than 0) when
        #the key is absent looks suspect -- confirm against reporting
        #expectations.
        try:
            title_val = title_count[meta]
        except KeyError:
            title_val = 1
        obj.titles = obj.titles + title_val
        #print>>sys.stderr, obj.volumes, obj.titles
        obj.save()
    print >> sys.stderr, "Writing cataloging edits to DB."
    for meta, count in cataloging_edit_count.items():
        #meta tuple layout mirrors the key built by the edit counters
        cataloger, edate, ctype, bib, mat_type, source = meta
        obj, created = CatEdit.objects.get_or_create(edit_date=edate,
            editor=cataloger,
            type=ctype,
            bib=bib,
            mat_type=mat_type,
            source=source)
def count_volumes(self, marc_items, this_harvest, counted_items): """Create summary accession info for items created within given range.""" from tech_services_reports.utility_code import convert_date, AcquisitionMethod from tech_services_reports.helpers import defaultdict as DD from tech_services_reports.helpers import namedtuple summary = DD(int) #Named tuple used as key for storing totals. #Method names need to match models. Acc = namedtuple( 'acc_summary', ['number', 'created', 'acquisition_method', 'format', 'location'], verbose=False) for item in marc_items: #Get acc note, skip anything without one. item_acc_note = item[settings_app.ITEM_ACC_NOTE] if not item_acc_note: continue try: item_number = item['y'].lstrip('.') except KeyError: print >> sys.stderr, 'no item number? ', item continue #Skip known items if item_number in counted_items: print >> sys.stderr, item_number, ' already counted. skipping.' continue item_created = convert_date(item['z']) item_number = item['y'] #Yes, some item records don't have a created date. if not item_created: print >> sys.stderr, 'no item create date? ', item continue #Skip items from before system was implemented. if item_created.year < settings_app.BEGIN_YEAR: continue if item_created.year == settings_app.BEGIN_YEAR: if item_created.month < settings_app.BEGIN_MONTH: continue # if item_created <= self.last_harvest.date: # continue # #Shouldn't really be possible but you never know. # if item_created > this_harvest.date: # continue #Determine bib's accession date by try: if not item['l']: continue raw_location = item['l'].strip() #Store raw location codes in case building names change in the future. #This will make display tricky. 
item_location = raw_location #item_location = location_format_map[raw_location]['building'] except KeyError: #item_location = 'unknown' item_location = 'unknown' try: acquisition_method = AcquisitionMethod(item_acc_note).note except NameError, e: print >> sys.stderr, item_created, item_number, item_acc_note continue try: item_format = location_format_map[raw_location]['format'] except KeyError: #print>>sys.stderr, "%s is an unknown location code." % item_location item_format = 'unknown' #Create a tuple for the summary key. if acquisition_method == 'Gift': print item _key = Acc(number=item_number, created=item_created, acquisition_method=acquisition_method, format=item_format, location=item_location) summary[_key] += 1