#!/usr/bin/python
"""Dump every photo record to entries.csv for spreadsheet-style review."""

import sys

# BUG FIX: the original did `sys.path += (sys.path[0] + '/..')`. The
# parenthesized expression is just a string, and `list += str` extends the
# list with the string's individual *characters*, polluting sys.path with
# one-character entries. Append the parent directory as a single entry.
sys.path.append(sys.path[0] + '/..')

import csv

import record

rs = record.AllRecords()

# 'wb' is the correct mode for the csv module on Python 2.
csv_writer = csv.writer(open('entries.csv', 'wb'))
csv_writer.writerow(['photo_id', 'date', 'folder', 'title', 'library_url'])
for r in rs:
    # Normalize the raw catalog fields before writing them out.
    date = record.CleanDate(r.date())
    title = record.CleanTitle(r.title())
    folder = record.CleanFolder(r.location())
    csv_writer.writerow([r.photo_id(), date, folder, title, r.preferred_url])
from collections import defaultdict, OrderedDict
import csv
import json
import record
import re
from ocr import cleaner

# The popular-photos JS file wraps JSON in 'var popular_photos = ' ... ';'.
# Slice those off before parsing.
popular_photos = json.loads(
    open('viewer/static/js/popular-photos.js', 'rb').read()[20:-2])
pop_ids = {photo['id'] for photo in popular_photos}

# Same trick for the lat/lon file: strip 'var lat_lons = ' and the ';'.
lat_lon_to_ids = json.loads(
    open('viewer/static/js/nyc-lat-lons-ny.js', 'rb').read()[15:-1])

rs = record.AllRecords('nyc/photos.pickle')
id_to_record = {r.photo_id(): r for r in rs}

# photo id -> (width, height), values kept as strings from the CSV.
id_to_dims = {
    photo_id: (width, height)
    for photo_id, width, height in csv.reader(open('nyc-image-sizes.txt'))
}

# ocr.json maps back-of-photo ids like "12345b" -> raw OCR text.
# Clean each entry in place.
back_id_to_text = json.load(open('ocr/ocr.json', 'rb'))
for back_id, raw_text in back_id_to_text.iteritems():
    back_id_to_text[back_id] = cleaner.clean(raw_text)

# Re-key the cleaned text by *front* photo id: rewrite the id's trailing
# 'f' / '-x' suffix to 'b' and prefix 'book' to get the back id.
id_to_text = {}
for front_id in id_to_record.iterkeys():
    back_id = 'book' + re.sub(r'f?(?:-[a-z])?$', 'b', front_id)
    if back_id in back_id_to_text:
        id_to_text[front_id] = back_id_to_text[back_id]
# via http://stackoverflow.com/questions/107405/how-do-you-send-a-head-http-request-in-python
class HeadRequest(urllib2.Request):
    """A urllib2.Request that issues HEAD instead of GET."""
    def get_method(self):
        return "HEAD"


def GetRedirect(url):
    """Return the redirect target (Location header) for url, or None.

    Shells out to `curl --silent -I` and scans the response headers.
    """
    location_header = 'location: '
    output = subprocess.check_output(['curl', '--silent', '-I', url])
    for line in output.split('\n'):
        # BUG FIX: HTTP header field names are case-insensitive, and most
        # HTTP/1.1 servers send 'Location:' with a capital L. The original
        # lowercase-only startswith() missed those, so GetRedirect returned
        # None even when a redirect was present. Compare case-insensitively.
        if line.lower().startswith(location_header):
            return line[len(location_header):].strip()
    return None


rs = record.AllRecords('records.pickle')
for idx, rec in enumerate(rs):
    digital_id = rec.photo_id()
    # if digital_id[-1] != 'f':
    #     # TODO(danvk): look at these
    #     print 'Skipping id %s' % digital_id
    #     continue
    output_sid = '%s/%s.sid' % (IMAGE_DIR, digital_id)
    output_jpg = '%s/%s.jpg' % (IMAGE_DIR, digital_id)
    # Only fetch images we don't already have in either format.
    if not os.path.exists(output_sid) and not os.path.exists(output_jpg):
        viewer_url = VIEWER_PATTERN % urllib.quote_plus(digital_id)
        # print '%s: %s' % (digital_id, viewer_url)
        # response = urllib2.urlopen(HeadRequest(viewer_url))
        # o = urlparse.urlparse(response.geturl())
        # item_id = urlparse.parse_qs(o.query)['item'][0].strip()
# Restrict the geocoder list to those named in --ok_coders (comma-separated).
ok_coders = options.ok_coders.split(',')
geocoders = [c for c in geocoders if c.name() in ok_coders]
if len(geocoders) != len(ok_coders):
    # A requested coder wasn't found; bail out loudly rather than geocode
    # with a partial set.
    sys.stderr.write('Coder mismatch: %s vs %s\n' % (options.ok_coders, ','.join([c.name() for c in geocoders])))
    sys.exit(1)

# TODO(danvk): does this belong here?
# Optional remapping of lat/lons, one 'old->new' pair per line.
lat_lon_map = {}
if options.lat_lon_map:
    for line in file(options.lat_lon_map):
        line = line.strip()
        if not line: continue
        old, new = line.split('->')
        lat_lon_map[old] = new

rs = record.AllRecords(path=options.pickle_path)
if options.ids_filter:
    # --ids_filter limits processing to an explicit comma-separated id list.
    ids = set(options.ids_filter.split(','))
    rs = [r for r in rs if r.photo_id() in ids]

# Load existing geocodes, if applicable.
# Maps photo id -> (coder, technique, location dict) so previously geocoded
# records can be reused instead of re-geocoded.
id_to_located_rec = {}
if options.previous_geocode_json:
    prev_recs = json.load(file(options.previous_geocode_json))
    for rec in prev_recs:
        if 'extracted' in rec and 'latlon' in rec['extracted']:
            x = rec['extracted']
            id_to_located_rec[rec['id']] = (None, x['technique'], {
                'address': x['located_str'],
                'lat': x['latlon'][0],
                'lon': x['latlon'][1]
                # NOTE(review): chunk is truncated here — the closing of this
                # dict/tuple lies beyond the visible source.
Very early is pre-1860. Very late is post-1945.

See https://github.com/danvk/oldnyc/issues/3
'''

import os, sys
# Make the repo root importable (this script lives in a subdirectory).
parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0,parentdir)

import re
import record


def extract_dates(date_str):
    # Return all four-digit years 1600-1999 found in date_str.
    return re.findall(r'\b(1[6789]\d\d)\b', date_str)


def mkurl(r):
    # Build the NYPL digital-collections URL for a record; the image_id is
    # the photo id with any trailing '-x' suffix removed.
    return 'http://digitalcollections.nypl.org/items/image_id/%s' % (
        re.sub(r'-[a-z]$', '', r.photo_id()))


if __name__ == '__main__':
    rs = record.AllRecords('nyc/records.pickle')
    for r in rs:
        # Collapse whitespace in the raw date field before matching.
        dstr = re.sub(r'\s+', ' ', r.date())
        if not dstr:
            continue
        for d in extract_dates(dstr):
            # String comparison of equal-length digit strings matches
            # numeric ordering, so this flags years outside 1860-1945.
            if d < '1860' or d > '1945':
                print '%4s\t%s\t%s' % (d, r.photo_id(), mkurl(r))
Each photo extracted from the original image gets its own record in the
new pickle.'''

import os, sys
# Make the repo root importable (this script lives in a subdirectory).
parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parentdir)

import cPickle
import copy
import record
import json
from collections import defaultdict

# NOTE(review): the usage string contains a %s that is never filled in with
# sys.argv[0] — confirm/fix where this message is maintained.
assert len(sys.argv) == 4, 'Usage: %s records.pickle photos.json photos.pickle'
_, in_pickle, photos_json, out_pickle = sys.argv

rs = record.AllRecords(in_pickle)
# photos.json maps source image filename -> its extracted photos.
expansions = json.load(file(photos_json))

f = file(out_pickle, "w")
p = cPickle.Pickler(f, 2)

skipped = 0
num_images, num_photos = 0, 0
for idx, r in enumerate(rs):
    digital_id = r.photo_id()
    image_file = '%s.jpg' % digital_id
    if image_file not in expansions:
        # No extraction data for this image; count and move on.
        skipped += 1
        continue
        # NOTE(review): chunk is truncated here — the rest of the loop body
        # lies beyond the visible source.
dest='upload_url', default='http://localhost:8080/upload',
                  help='Upload endpoint. Default is local dev_appserver.')
parser.add_option('', '--start_chunk', default=0, dest='start_chunk',
                  type=int,
                  help='Which chunk to start with. Used to resume uploads.')
(options, args) = parser.parse_args()

assert options.pickle_path
assert options.image_sizes_path

all_rs = record.AllRecords(options.pickle_path)

# The JS file has a leading 'var foo = {' and a trailing '};' that we don't want.
lat_lons = {}
if options.lat_lons_js_path:
    # Drop the first and last lines of the JS wrapper, then re-brace the
    # remainder so it parses as plain JSON.
    js_data = file(options.lat_lons_js_path).readlines()
    js_data = '{' + ''.join(js_data[1:-1]) + '}'
    lat_lons = json.loads(js_data)
    # Only upload records whose photo id appears in the lat/lon data.
    ok_ids = set()
    for ll, images in lat_lons.iteritems():
        for _, _, photo_id in images:
            ok_ids.add(photo_id)
    rs = [r for r in all_rs if r.photo_id() in ok_ids]
else:
    # NOTE(review): chunk is truncated here — the else branch body lies
    # beyond the visible source.