コード例 #1
0
ファイル: to-csv.py プロジェクト: jze/sfhistory
#!/usr/bin/python
"""Dump every record to entries.csv, one row per photo (Python 2 script)."""

import sys
# BUG FIX: '+=' with a bare string extends sys.path with the string's
# individual CHARACTERS ('(...)' is not a tuple).  Append the parent
# directory as a single entry instead.
sys.path.append(sys.path[0] + '/..')

import csv
import record

rs = record.AllRecords()

# 'wb' is the correct mode for the csv module under Python 2.
csv_writer = csv.writer(open('entries.csv', 'wb'))
csv_writer.writerow(['photo_id', 'date', 'folder', 'title', 'library_url'])

for r in rs:
    # Normalize the free-text fields before writing them out.
    date = record.CleanDate(r.date())
    title = record.CleanTitle(r.title())
    folder = record.CleanFolder(r.location())

    # NOTE(review): preferred_url is read as an attribute while the other
    # fields are method calls -- confirm it is not meant to be called.
    csv_writer.writerow([r.photo_id(), date, folder, title, r.preferred_url])
コード例 #2
0
from collections import defaultdict, OrderedDict
import csv
import json
import record
import re

from ocr import cleaner

# The JS data files wrap a JSON payload in 'var foo = ...;'; slice the
# wrapper off before parsing.
# strip leading 'var popular_photos = ' and trailing ';'
popular_photos = json.loads(open('viewer/static/js/popular-photos.js', 'rb').read()[20:-2])
pop_ids = set(entry['id'] for entry in popular_photos)

# strip leading 'var lat_lons = ' and trailing ';'
lat_lon_to_ids = json.loads(open('viewer/static/js/nyc-lat-lons-ny.js', 'rb').read()[15:-1])

rs = record.AllRecords('nyc/photos.pickle')
id_to_record = {}
for rec in rs:
    id_to_record[rec.photo_id()] = rec

# Image dimensions, keyed by photo id.
id_to_dims = {
    photo_id: (width, height)
    for photo_id, width, height in csv.reader(open('nyc-image-sizes.txt'))
}

# ocr.json maps "12345b" -> text. We need photo id -> text.
back_id_to_text = json.load(open('ocr/ocr.json', 'rb'))
for back_id in back_id_to_text.keys():
    back_id_to_text[back_id] = cleaner.clean(back_id_to_text[back_id])

id_to_text = {}
for photo_id in id_to_record:
    # Derive the back-of-photo id from the photo id.
    back_id = 'book' + re.sub(r'f?(?:-[a-z])?$', 'b', photo_id)
    if back_id in back_id_to_text:
        id_to_text[photo_id] = back_id_to_text[back_id]
コード例 #3
0
# via http://stackoverflow.com/questions/107405/how-do-you-send-a-head-http-request-in-python
class HeadRequest(urllib2.Request):
    """A urllib2 Request that issues HEAD instead of the default GET."""

    def get_method(self):
        # urllib2 calls get_method() to choose the HTTP verb.
        return "HEAD"


def GetRedirect(url):
    """Return the redirect target of url, or None if there is no redirect.

    Issues a HEAD request via curl and scans the response headers for a
    Location header.
    """
    location_header = 'location: '
    output = subprocess.check_output(['curl', '--silent', '-I', url])
    for line in output.split('\n'):
        # BUG FIX: header names are case-insensitive (RFC 7230) and servers
        # usually send 'Location:', which the exact lowercase match missed.
        if line.lower().startswith(location_header):
            # .strip() also removes the trailing '\r' from CRLF line ends.
            return line[len(location_header):].strip()
    return None


rs = record.AllRecords('records.pickle')
for idx, rec in enumerate(rs):
    digital_id = rec.photo_id()
    # if digital_id[-1] != 'f':
    #   # TODO(danvk): look at these
    #   print 'Skipping id %s' % digital_id
    #   continue

    # IMAGE_DIR and VIEWER_PATTERN are defined elsewhere in this file.
    output_sid = '%s/%s.sid' % (IMAGE_DIR, digital_id)
    output_jpg = '%s/%s.jpg' % (IMAGE_DIR, digital_id)
    # Only process photos whose image (MrSID or JPEG) isn't on disk yet.
    if not os.path.exists(output_sid) and not os.path.exists(output_jpg):
        viewer_url = VIEWER_PATTERN % urllib.quote_plus(digital_id)
        # print '%s: %s' % (digital_id, viewer_url)
        # response = urllib2.urlopen(HeadRequest(viewer_url))
        # o = urlparse.urlparse(response.geturl())
        # item_id = urlparse.parse_qs(o.query)['item'][0].strip()
コード例 #4
0
    ok_coders = options.ok_coders.split(',')
    # Keep only the geocoders the user explicitly allowed via --ok_coders.
    geocoders = [c for c in geocoders if c.name() in ok_coders]
    if len(geocoders) != len(ok_coders):
      # Some name in --ok_coders didn't match any available geocoder.
      sys.stderr.write('Coder mismatch: %s vs %s\n' % (options.ok_coders, ','.join([c.name() for c in geocoders])))
      sys.exit(1)

  # TODO(danvk): does this belong here?
  # Maps an old lat,lon string to a corrected one; the file has one
  # 'old->new' mapping per line, blank lines ignored.
  lat_lon_map = {}
  if options.lat_lon_map:
    for line in file(options.lat_lon_map):
      line = line.strip()
      if not line: continue
      old, new = line.split('->')
      lat_lon_map[old] = new

  rs = record.AllRecords(path=options.pickle_path)
  if options.ids_filter:
    # --ids_filter is a comma-separated whitelist of photo ids.
    ids = set(options.ids_filter.split(','))
    rs = [r for r in rs if r.photo_id() in ids]

  # Load existing geocodes, if applicable.
  id_to_located_rec = {}
  if options.previous_geocode_json:
    prev_recs = json.load(file(options.previous_geocode_json))
    for rec in prev_recs:
      if 'extracted' in rec and 'latlon' in rec['extracted']:
        x = rec['extracted']
        # NOTE(review): the tuple looks like (record, technique, location
        # dict); the literal is truncated in this view -- confirm against
        # the full file.
        id_to_located_rec[rec['id']] = (None, x['technique'], {
              'address': x['located_str'],
              'lat': x['latlon'][0],
              'lon': x['latlon'][1]
コード例 #5
0
Very early is pre-1860.
Very late is post-1945.

See https://github.com/danvk/oldnyc/issues/3
'''

import os, sys
parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0,parentdir) 
import re

import record

def extract_dates(date_str):
    """Return every four-digit year in 1600-1999 found in date_str."""
    year_pattern = re.compile(r'\b(1[6789]\d\d)\b')
    return year_pattern.findall(date_str)


def mkurl(r):
    """Build the NYPL digital-collections URL for record r's image."""
    # Drop a trailing '-a'/'-b'-style suffix from the photo id.
    image_id = re.sub(r'-[a-z]$', '', r.photo_id())
    return 'http://digitalcollections.nypl.org/items/image_id/%s' % (
            image_id)


if __name__ == '__main__':
    rs = record.AllRecords('nyc/records.pickle')
    for r in rs:
        dstr = re.sub(r'\s+', ' ', r.date())
        if not dstr: continue
        for d in extract_dates(dstr):
            if d < '1860' or d > '1945':
                print '%4s\t%s\t%s' % (d, r.photo_id(), mkurl(r))
コード例 #6
0
Each photo extracted from the original image gets its own record in the new pickle.'''

import os, sys
parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parentdir)

import cPickle
import copy
import record
import json
from collections import defaultdict

# BUG FIX: interpolate the program name; previously the literal '%s' was
# printed because the format string was never applied.
assert len(sys.argv) == 4, (
    'Usage: %s records.pickle photos.json photos.pickle' % sys.argv[0])
_, in_pickle, photos_json, out_pickle = sys.argv

rs = record.AllRecords(in_pickle)
# photos.json maps '<digital_id>.jpg' -> extracted sub-photos.
expansions = json.load(file(photos_json))

# BUG FIX: protocol-2 pickles are binary; open with 'wb' so this also
# behaves correctly on Windows (on Unix 'w' and 'wb' are equivalent).
f = file(out_pickle, "wb")
p = cPickle.Pickler(f, 2)

skipped = 0
num_images, num_photos = 0, 0

for idx, r in enumerate(rs):
    digital_id = r.photo_id()
    image_file = '%s.jpg' % digital_id
    # Only records listed in photos.json get expanded; count the rest.
    if image_file not in expansions:
        skipped += 1
        continue
コード例 #7
0
                  dest='upload_url',
                  default='http://localhost:8080/upload',
                  help='Upload endpoint. Default is local dev_appserver.')
parser.add_option('',
                  '--start_chunk',
                  default=0,
                  dest='start_chunk',
                  type=int,
                  help='Which chunk to start with. Used to resume uploads.')

(options, args) = parser.parse_args()

# Both of these flags are required.
assert options.pickle_path
assert options.image_sizes_path

all_rs = record.AllRecords(options.pickle_path)

# The JS file has a leading 'var foo = {' and a trailing '};' that we don't want.
lat_lons = {}
if options.lat_lons_js_path:
    js_data = file(options.lat_lons_js_path).readlines()
    # Drop the first and last lines (the 'var' wrapper) and re-brace the
    # remainder so it parses as JSON.
    js_data = '{' + ''.join(js_data[1:-1]) + '}'
    lat_lons = json.loads(js_data)

    ok_ids = set()
    for ll, images in lat_lons.iteritems():
        # Each value looks like a list of (?, ?, photo_id) triples; only the
        # photo id is used here.  NOTE(review): confirm the tuple layout
        # against whatever writes this JS file.
        for _, _, photo_id in images:
            ok_ids.add(photo_id)

    # Restrict the upload to records that have a geocode.
    rs = [r for r in all_rs if r.photo_id() in ok_ids]
else: