Example #1
0
def test_merge_lines():
    """Compare cleaner.merge_lines output with the expected text per sample.

    Samples 0-6 have their warnings removed before merging. The final
    check runs cleaner.clean on text that has no trailing newline.
    """

    def prepared(index):
        # Every sample is passed through remove_warnings before merging.
        return cleaner.remove_warnings(samples[index])

    eq_('''120-130 Baxter Street, west side, at, adjoining, and south of the S.W. corner of Hester Street. The view also shows 200-202 Hester Street, at and adjoining the S.W. corner of Baxter Street.
About 1925.
''',
        cleaner.merge_lines(prepared(0)))

    eq_('''The 440 Club was a popular downstairs watering hole for local office inhabitants. So named - 4 East 40th. After losing 1ease it became a restaurant which failed within a short time. Bldg will be demolished to sske way for new (proposed) hoae of Ihe Republic National Bank.
Earl Christian, photographer (NnPL)
Aug.1979
''',
        cleaner.merge_lines(prepared(1)))

    # Sample 2 is expected to come back from merge_lines unchanged.
    unchanged = prepared(2)
    eq_(unchanged, cleaner.merge_lines(unchanged))

    # XXX: it would be better if the "Sfation" line were merged up.
    eq_('''(1)
Eleventh Avenue at northeast cornsr of 57th Street, and showing buildings on north west side of Avenue.
May 20, 1927.
(2)
Eleventh Avenue northeast corner 57th Street, Gas
Sfation in plain site.
P.L.Speer.
NO REPRODU IONS.  September 19, 1933.
''',
        cleaner.merge_lines(prepared(3)))

    eq_('''308 E. 37th Street, south side, east of Second Avenue. The easterly half of St. Gabriel's Roman Catholic Church as viewed northwaro from the alter rail. TCo the lett appears the organ loft and the large stained window that is over the entrance.
.January 11, 1939
Somach Yhoto Service
New York City Tunnel Authority
''',
        cleaner.merge_lines(prepared(4)))

    # This one has a hyphenated phrase which gets joined
    eq_('''New Dock Street, west side, north of Water Street, showing W. P. S. workmen erecting a new storehouse for the New York City Department of Purchase under the Brooklyn Bridge.
June 17, 1936
Works Progress Administration
Project 65-97-68
''',
        cleaner.merge_lines(prepared(5)))

    # This one has a hyphen followed by a short line.
    eq_('''Coney Island: The beach and boardwalk on a hot August afternoon, looking east.
1939
Alexaneer Alland
Ap7-:  Wg ~E 33~4
''',
        cleaner.merge_lines(prepared(6)))

    # This has no trailing newline
    eq_('Hello\nThere', cleaner.clean('Hello\nThere'))
def test_merge_lines():
    """Exercise cleaner.merge_lines on samples 0-6, then cleaner.clean.

    Each sample goes through cleaner.remove_warnings first; the merged
    result is then compared with the literal expected text.
    """
    merged = cleaner.merge_lines(cleaner.remove_warnings(samples[0]))
    eq_('''120-130 Baxter Street, west side, at, adjoining, and south of the S.W. corner of Hester Street. The view also shows 200-202 Hester Street, at and adjoining the S.W. corner of Baxter Street.
About 1925.
''', merged)

    merged = cleaner.merge_lines(cleaner.remove_warnings(samples[1]))
    eq_('''The 440 Club was a popular downstairs watering hole for local office inhabitants. So named - 4 East 40th. After losing 1ease it became a restaurant which failed within a short time. Bldg will be demolished to sske way for new (proposed) hoae of Ihe Republic National Bank.
Earl Christian, photographer (NnPL)
Aug.1979
''', merged)

    # For sample 2 the merged text must equal the input text.
    source = cleaner.remove_warnings(samples[2])
    merged = cleaner.merge_lines(source)
    eq_(source, merged)

    # XXX: it would be better if the "Sfation" line were merged up.
    merged = cleaner.merge_lines(cleaner.remove_warnings(samples[3]))
    eq_('''(1)
Eleventh Avenue at northeast cornsr of 57th Street, and showing buildings on north west side of Avenue.
May 20, 1927.
(2)
Eleventh Avenue northeast corner 57th Street, Gas
Sfation in plain site.
P.L.Speer.
NO REPRODU IONS.  September 19, 1933.
''', merged)

    merged = cleaner.merge_lines(cleaner.remove_warnings(samples[4]))
    eq_('''308 E. 37th Street, south side, east of Second Avenue. The easterly half of St. Gabriel's Roman Catholic Church as viewed northwaro from the alter rail. TCo the lett appears the organ loft and the large stained window that is over the entrance.
.January 11, 1939
Somach Yhoto Service
New York City Tunnel Authority
''', merged)

    # This one has a hyphenated phrase which gets joined
    merged = cleaner.merge_lines(cleaner.remove_warnings(samples[5]))
    eq_('''New Dock Street, west side, north of Water Street, showing W. P. S. workmen erecting a new storehouse for the New York City Department of Purchase under the Brooklyn Bridge.
June 17, 1936
Works Progress Administration
Project 65-97-68
''', merged)

    # This one has a hyphen followed by a short line.
    merged = cleaner.merge_lines(cleaner.remove_warnings(samples[6]))
    eq_('''Coney Island: The beach and boardwalk on a hot August afternoon, looking east.
1939
Alexaneer Alland
Ap7-:  Wg ~E 33~4
''', merged)

    # This has no trailing newline
    cleaned = cleaner.clean('Hello\nThere')
    eq_('Hello\nThere', cleaned)
Example #3
0
# Set of the 'id' values of every entry in popular_photos.
pop_ids = {x['id'] for x in popular_photos}

# strip leading 'var lat_lons = ' and trailing ';'
# The file is opened in a `with` block so the handle is closed promptly.
with open('viewer/static/js/nyc-lat-lons-ny.js', 'rb') as lat_lons_file:
    lat_lon_to_ids = json.loads(lat_lons_file.read()[15:-1])

rs = record.AllRecords('nyc/photos.pickle')
id_to_record = {r.photo_id(): r for r in rs}

# Each CSV row is (photo id, width, height); the values stay as read from
# the file (no numeric conversion is applied here).
id_to_dims = {}
with open('nyc-image-sizes.txt') as sizes_file:
    for photo_id, width, height in csv.reader(sizes_file):
        id_to_dims[photo_id] = (width, height)

# ocr.json maps "12345b" -> text. We need photo id -> text.
with open('ocr/ocr.json', 'rb') as ocr_file:
    back_id_to_text = json.load(ocr_file)
# Only values of existing keys are replaced, so the dict does not change
# size while it is being iterated.
for k, txt in back_id_to_text.iteritems():
    back_id_to_text[k] = cleaner.clean(txt)
id_to_text = {}
for photo_id in id_to_record.iterkeys():
    # Swap an optional trailing 'f' and/or '-<letter>' suffix for 'b' and
    # prefix 'book' to form the key used in the OCR mapping.
    back_id = 'book' + re.sub(r'f?(?:-[a-z])?$', 'b', photo_id)
    if back_id in back_id_to_text:
        id_to_text[photo_id] = back_id_to_text[back_id]
back_id_to_text = None  # clear


def decode(b):
    """Decode the byte string ``b`` to text.

    UTF-8 is tried first. If that raises UnicodeDecodeError, the encoding
    guessed by ``chardet.detect`` is used instead.

    Args:
        b: byte string to decode.

    Returns:
        The decoded text.
    """
    try:
        return b.decode('utf8')
    except UnicodeDecodeError:
        # chardet reports an encoding of None when it cannot make a guess;
        # passing None to decode() would raise TypeError. Fall back to
        # latin-1, which maps every byte value and so cannot fail.
        encoding = chardet.detect(b)['encoding'] or 'latin-1'
        return b.decode(encoding)

Example #4
0
# Each JSON input is read inside a `with` block so its file handle is
# closed as soon as parsing finishes.
with open('analysis/rotations/rotations.json') as rotations_file:
    id_to_rotation = json.load(rotations_file)

# ocr.json maps "12345b" -> text. We need photo id -> text.
with open('ocr/ocr.json', 'rb') as ocr_file:
    back_id_to_text = json.load(ocr_file)
with open('ocr/feedback/fixes.json', 'rb') as fixes_file:
    manual_fixes = json.load(fixes_file)
id_to_text = {}
for photo_id in id_to_record.iterkeys():
    # Swap an optional trailing 'f' and/or '-<letter>' suffix for 'b'.
    back_id = re.sub(r'f?(?:-[a-z])?$', 'b', photo_id)
    # The OCR mapping is keyed with a 'book' prefix; manual fixes are not.
    book_id = 'book' + back_id
    if book_id in back_id_to_text:
        id_to_text[photo_id] = back_id_to_text[book_id]
    # A manual fix is assigned last, so it overrides the OCR text.
    if back_id in manual_fixes:
        id_to_text[photo_id] = manual_fixes[back_id]

# Only values of existing keys are replaced, so the dict does not change
# size while it is being iterated.
for k, txt in id_to_text.iteritems():
    id_to_text[k] = cleaner.clean(txt)

back_id_to_text = None  # clear


def image_url(photo_id, is_thumb):
    """Build the image URL for ``photo_id``.

    Args:
        photo_id: id of the photo; also looked up in ``id_to_rotation``.
        is_thumb: truthy selects the 'thumb' path segment, falsy '600px'.

    Returns:
        The rotated-assets URL (with the rotation value embedded in the
        file name) when ``id_to_rotation`` holds a truthy entry for the
        photo, otherwise the plain assets URL.
    """
    rotation = id_to_rotation.get(photo_id)
    size_dir = 'thumb' if is_thumb else '600px'
    if rotation:
        return 'http://www.oldnyc.org/rotated-assets/%s/%s.%s.jpg' % (
            size_dir, photo_id, rotation)
    # A missing or falsy rotation (e.g. 0) means the unrotated asset.
    return 'http://oldnyc-assets.nypl.org/%s/%s.jpg' % (size_dir, photo_id)


def decode(b):