def test_pickle_it():
    """Round-trip some_data through util.pickle_it/unpickle_it and verify."""
    target = os.path.join(MY_PATH, 'output/some_data.pickle')
    # Start from a clean slate so the file we read back is definitely ours.
    if os.path.exists(target):
        os.remove(target)
    util.pickle_it(some_data, target)
    restored = util.unpickle_it(target)
    assert restored[0] == some_data[0]
def main():
    """Get IA collection specific to serial (Columbia Library Columns).

    Reads bibid/id/label rows from a Google sheet, extracts feed data from
    Internet Archive, removes description lines containing HTML anchors,
    saves the result to a pickle, builds the XML feed, and writes any
    extraction/build errors back to the sheet.
    """
    sheet_id = '1yTDyd5GQFEsVBiKOnt5T1ejBdXhxhmXVUn6jQ-dg_5I'
    sheet_tab = 'ColumbiaColumns'
    the_in_sheet = dataSheet(sheet_id, sheet_tab + '!A:Z')
    the_out_sheet = dataSheet(sheet_id, 'extract-errors!A:Z')
    the_err_sheet = dataSheet(sheet_id, 'errors!A:Z')

    output_folder = 'output/ia/'
    feed_stem = 'ia_clc_feed'
    collection_title = 'Columbia Library Columns'
    abbr = 'clc'
    pickle_path = output_folder + feed_stem + '.pickle'

    the_input = the_in_sheet.getData()
    the_input.pop(0)  # discard head row (was bound to an unused local)
    the_records = [{'bibid': r[0], 'id': r[2], 'label': r[3]}
                   for r in the_input]

    feed_data = ia.extract_data(the_records, feed_stem, collection_title)

    # Filter out description lines containing HTML anchors. Copy each entry
    # so we do not mutate feed_data['data'] in place (the original aliased
    # the entry and rewrote its 'description' key).
    feed_data_new = {'errors': feed_data['errors'], 'data': []}
    for e in feed_data['data']:
        new_entry = dict(e)
        new_entry['description'] = [d for d in e['description'] if '<a' not in d]
        feed_data_new['data'].append(new_entry)

    # Save to pickle.
    print('Saving ' + str(len(feed_data_new['data'])) +
          ' records to ' + pickle_path)
    util.pickle_it(feed_data_new, pickle_path)

    # Report any extraction errors.
    the_out_sheet.appendData(feed_data['errors'])

    # Generate XML.
    x = ia.build_feed(output_folder + feed_stem + '.pickle', abbr)

    # Report any build errors/warnings.
    the_err_sheet.appendData(x)
def get_collection(sheet_id, sheet_tab, feed_stem, collection_title,
                   multipart=False):
    """Get Internet Archive collection and save to pickle.

    Args:
        sheet_id (str): Google sheet id
        sheet_tab (str): Google sheet tab name
        feed_stem (str): abbreviation used in file naming and feed identification
        collection_title (str): Title of collection (e.g., Medical Heritage Library)
        multipart (bool, optional): Incl/exclude multi-volume works. Defaults to False.
    """
    the_in_sheet = dataSheet(sheet_id, sheet_tab + '!A:Z')
    the_out_sheet = dataSheet(sheet_id, 'extract-errors!A:Z')
    pickle_path = OUT_PATH + feed_stem + '.pickle'

    # Bibids plus IA ids to process; first row is the header.
    rows = the_in_sheet.getData()
    rows.pop(0)
    print(str(len(rows)) + ' records in ' + collection_title + '...')

    the_records = []
    for row in rows:
        # Column 4 holds an arbitrary number of ;-separated 920 fields.
        volumes = []
        for field in row[4].split(';'):
            if 'archive.org' not in field:
                continue
            parsed = ia.parse_920(field)
            # Only keep entries whose id is truthy.
            if parsed['id']:
                volumes.append({
                    'bibid': row[0],
                    'id': parsed['id'],
                    'label': parsed['label'],
                })
        # Multi-volume works are included only when multipart is requested;
        # monographs (exactly one volume) are always included.
        if len(volumes) == 1 or multipart is True:
            the_records += volumes

    feed_data = ia.extract_data(the_records, feed_stem, collection_title)

    print('Saving ' + str(len(feed_data['data'])) +
          ' records to ' + pickle_path)
    util.pickle_it(feed_data, pickle_path)

    the_out_sheet.appendData(feed_data['errors'])
def get_collection(sheet_id, sheet_tab, feed_stem, collection_title,
                   multipart=False):
    """Harvest an OAPEN collection listed in a Google sheet and pickle it.

    Args:
        sheet_id (str): Google sheet id
        sheet_tab (str): Google sheet tab name
        feed_stem (str): abbreviation used in file naming and feed identification
        collection_title (str): Title of collection
        multipart (bool, optional): Incl/exclude multi-volume works. Defaults to False.
    """
    the_in_sheet = dataSheet(sheet_id, sheet_tab + '!A:Z')
    the_out_sheet = dataSheet(sheet_id, 'extract-errors!A:Z')
    pickle_path = output_dir + '/' + feed_stem + '.pickle'

    # Bibids plus OAPEN ids to process; first row is the header.
    rows = the_in_sheet.getData()
    rows.pop(0)

    the_records = []
    for row in rows:
        # Column 4 holds an arbitrary number of ;-separated 920 fields.
        matches = []
        for field in row[4].split(';'):
            if 'library.oapen.org/handle/20.500.12657/' in field:
                parsed = parse_920(field)
                matches.append({
                    'bibid': row[0],
                    'id': parsed['id'],
                    'label': parsed['label'],
                })
        # Monographs always pass; multi-volume works only when requested.
        if len(matches) == 1 or multipart is True:
            the_records += matches
        elif len(matches) > 1:
            print("WARNING: " + str(row[0]) + " has multiple volumes. Skipping!")
        else:
            print("WARNING: could not find OAPEN record in " +
                  str(row[0]) + ". Skipping!")

    feed_data = extract_data(the_records, feed_stem, collection_title)

    print('Saving ' + str(len(feed_data['data'])) +
          ' records to ' + pickle_path)
    util.pickle_it(feed_data, pickle_path)

    pprint(feed_data['errors'])
    the_out_sheet.appendData(feed_data['errors'])
def main():
    """Build Springer subject data from the Google sheet and pickle it."""
    the_sheet = dataSheet(
        '1D2E5Sm3qZdU3MGXk7q2XxfBpQS1iqauQm19f_y9aTbM', 'Sheet1!A:Z')
    out_path = os.path.join(
        MY_PATH, 'output_test/springer/springer_subjects.pickle')
    # pickle_it's return value is echoed as a simple progress report.
    print(pickle_it(get_subjects(the_sheet), out_path))
    quit()
def get_linglong():
    """Get the linglong data from IA and save in one pickle per year (vol).
    """
    the_sheet = dataSheet(SHEET_ID, 'LingLong!A:Z')
    rows = the_sheet.getData()
    rows.pop(0)  # discard header row

    # Group sheet rows by volume year (1931-1937 inclusive); column 1 holds
    # the year, columns 0/2/3 hold bibid/id/label.
    volumes = [
        {
            'vol': year,
            'items': [
                {'bibid': r[0], 'id': r[2], 'label': r[3]}
                for r in rows if r[1] == str(year)
            ],
        }
        for year in range(1931, 1938)
    ]

    for volume in volumes:
        print(' ')
        print(volume['vol'])
        feed_stem = 'ia_ll_' + str(volume['vol'])
        pickle_path = OUTPUT_FOLDER + '/' + feed_stem + '.pickle'
        feed_data = ia.extract_data(
            volume['items'], feed_stem,
            'Ling Long (' + str(volume['vol']) + ')')
        pprint(feed_data['errors'])
        print('Saving ' + str(len(feed_data['data'])) +
              ' records to ' + pickle_path)
        util.pickle_it(feed_data, pickle_path)
# Script to harvest oapen data and transform to OPDS.
# TODO: everything!

import requests
import json
from pprint import pprint
import dcps_utils as util

# Collections to harvest from the OAPEN REST API.
the_collections = [
    {
        "name": "ERC",
        "url": "http://library.oapen.org/rest/search?query=oapen.collection:%22European%20Research%20Council%22&expand=metadata,bitstreams&limit=1000",
    },
]

# Fetch each collection's JSON payload and save it to a per-collection pickle.
for coll in the_collections:
    response = requests.get(coll["url"])
    collection_data = json.loads(response.text)
    util.pickle_it(
        collection_data,
        'output/' + "oapen_" + coll["name"] + "_data.pickle")