Example #1
            if pgenre != 'bio' and not likely and ("Biography" in genres or "Autobiography" in genres
                                                   or "Biography" in subjects or "Autobiography" in subjects):
                suspicious = True
            elif pgenre != 'bio' and not likely and ("Description and travel" in genres or "Description and travel" in subjects):
                suspicious = True
            elif english < 0.45:
                suspicious = True

            if suspicious:
                row['metadatasuspicious'] = True
            else:
                row['metadatasuspicious'] = ''

            row['inferreddate'] = utils.date_row(row)
            row['rawprobability'] = row.pop('probability')
            row['englishtop1000pct'] = row.pop('englishpct')

            rows[pgenre].append(row)

    # now we have all the bio, dra, fic, and poe
    # write them to file

    write_genres(rows, firstfile)
    firstfile = False
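Every excerpt on this page calls utils.date_row (imported from SonicScrewdriver) to reduce a metadata row to a single inferred publication year. The library's actual logic is not shown here, so the helper below is only a minimal sketch of the idea, with assumed field names; it is not the real implementation.

def inferred_year(row):
    # Sketch only: return the first recoverable four-digit year from
    # a metadata row. The field names checked here are assumptions,
    # not the columns SonicScrewdriver.date_row actually consults.
    for field in ('inferreddate', 'date', 'startdate', 'enddate'):
        digits = ''.join(ch for ch in str(row.get(field, '')) if ch.isdigit())
        if len(digits) >= 4:
            return int(digits[:4])
    return 0  # no usable year found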

Example #2
            selecteddates[htid] = date
            selected.append(htid)

bydate = dict()
authors = dict()
titles = dict()
datesbyhtid = dict()

with open('/Users/tunder/work/genre/metadata/poemeta.csv', encoding = 'utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        htid = row['htid']
        authors[htid] = row['author']
        titles[htid] = row['title']

        date = utils.date_row(row)
        datesbyhtid[htid] = date
        if htid in selected:
            continue
        if date in bydate:
            bydate[date].append(htid)
        else:
            bydate[date] = [htid]

controlset = set()

skip = int(input('Skip how many? '))
for theid in selected[skip:]:
    date = selecteddates[theid]
    print(theid)
    print(date)
Example #3
import csv, os
from collections import Counter
import SonicScrewdriver as utils

tagas = input("Tag volumes in this session as? ")

metadata = dict()
authordict = dict()
all_docids = set()

metasource = '/Volumes/TARDIS/work/metadata/MergedMonographs.tsv'
with open(metasource, encoding = 'utf-8') as f:
    reader = csv.DictReader(f, delimiter = '\t')
    for row in reader:
        docid = row['HTid']
        row['date'] = utils.date_row(row)
        metadata[docid] = row
        authorstring = row['author']
        authorstring = authorstring.replace(',', ' ')
        authorstring = authorstring.replace('.', ' ')
        authorwords = authorstring.lower().split()
        for word in authorwords:
            if word not in authordict:
                authordict[word] = set()

            authordict[word].add(docid)

        all_docids.add(docid)

metaout = '/Users/tunder/Dropbox/fiction/meta/genremeta.csv'
with open(metaout, encoding = 'utf-8') as f:
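The loop above builds authordict as an inverted index from lowercase author-name words to sets of docids. The original script's lookup code is not part of this excerpt; the function below is an illustrative sketch of how such an index is typically queried, intersecting the sets for each word of a query.

def find_author_candidates(query, authordict, all_docids):
    # Illustrative sketch (not from the original script): normalize the
    # query the same way the index was built, then intersect the docid
    # sets for each word so only volumes matching every word survive.
    words = query.replace(',', ' ').replace('.', ' ').lower().split()
    candidates = set(all_docids)
    for word in words:
        candidates &= authordict.get(word, set())
    return candidates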
Example #4
newfic = []
oldfic = []

with open(inpath, encoding = 'utf-8') as f:
    reader = csv.DictReader(f, delimiter = '\t')
    fieldnames = reader.fieldnames
    for row in reader:
        genre = row['sampledas']
        # right now we're running on biography
        if genre != 'bio':
            continue

        authdate = row['authordate']
        birth, death = cabinet.parse_authordate(authdate)
        date = utils.date_row(row)
        if death > 0 and death < 1920:
            oldfic.append(row)
            continue
        elif death > 0 and death + 20 < date:
            oldfic.append(row)
            continue
        else:
            stdauthor = standardize_name(row['author'])
            row['stdauthor'] = stdauthor
            newfic.append(row)

def numeric_only(astring):
    # strip everything but the digits out of a string
    numonly = ''
    for character in astring:
        if character.isdigit():
            numonly += character
    return numonly
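cabinet.parse_authordate, used in the loop above, is not included in this excerpt. The stand-in below is only a guess at its contract, assuming author-date strings of the form '1840-1902.' and returning (birth, death) with 0 for any year that cannot be recovered.

def parse_authordate_sketch(authdate):
    # Hypothetical stand-in for cabinet.parse_authordate: split an
    # "1840-1902." style string on the hyphen and pull out the digits,
    # returning 0 where a year is missing.
    parts = authdate.split('-')
    birthdigits = numeric_only(parts[0]) if parts else ''
    deathdigits = numeric_only(parts[1]) if len(parts) > 1 else ''
    birth = int(birthdigits) if birthdigits else 0
    death = int(deathdigits) if deathdigits else 0
    return birth, death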
Example #5
fieldnames = [
    'docid', 'recordid', 'oclc', 'locnum', 'includedbc', 'author',
    'authordate', 'imprint', 'inferreddate', 'place', 'enumcron', 'subjects',
    'genres', 'title'
]

print()
print('A total of ' + str(counter) + ' volumes were scanned.')
print()
print('A total of ' + str(fuzzycounter) + ' volumes were fuzzymatched.')
print()

with open('../rawdata/sf_intersection.csv', mode='w', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()
    for row in allsf:
        if 'htid' in row:
            row['docid'] = row['htid']
        row['includedbc'] = reasons[row['docid']]

        if 'inferreddate' not in row:
            row['inferreddate'] = utils.date_row(row)
        if 'authordate' not in row:
            row['authordate'] = ''
        if 'genres' not in row:
            row['genres'] = ''

        if ' / |' in row['title']:
            row['title'] = row['title'].split(' / |')[0]

        writer.writerow(row)
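The extrasaction='ignore' argument is what allows these full metadata rows to be written against the shorter fieldnames list: csv.DictWriter simply drops any keys that are not in fieldnames instead of raising a ValueError, which is its default behavior. A minimal, self-contained illustration:

import csv, io

buffer = io.StringIO()
writer = csv.DictWriter(buffer, fieldnames=['docid', 'title'], extrasaction='ignore')
writer.writeheader()
# 'subjects' is absent from fieldnames, so it is silently discarded.
writer.writerow({'docid': 'example.001', 'title': 'A Sample Title', 'subjects': 'Fiction'})
print(buffer.getvalue())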
Example #6
            selectedmeta[htid]['firstpub'] = firstpub

bydate = dict()
authors = dict()
titles = dict()
datesbyhtid = dict()

with open('/Users/tunder/work/genre/metadata/ficmeta.csv',
          encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        htid = row['htid']
        authors[htid] = row['author']
        titles[htid] = row['title']

        date = utils.date_row(row)
        datesbyhtid[htid] = date
        if htid in selected:
            continue
        if date in bydate:
            bydate[date].append(htid)
        else:
            bydate[date] = [htid]

controlset = set()
controlmeta = dict()
usedfromselected = list()

skip = int(input('Skip how many? '))
for theid in selected[skip:]:
    date = selecteddates[theid]
Example #7
                improved += 1


            # if newauthor != author:
            #     print('New author: ', newauthor)
            #     print('old author: ', author)
            #     user = input('? ')

            # if newtitle != title:
            #     print('New title: ', newtitle)
            #     print('old title: ', title)
            #     user = input('? ')

            ficmeta[docid]['title'] = newtitle
            ficmeta[docid]['author'] = newauthor
            ficmeta[docid]['inferreddate'] = utils.date_row(row)

fieldnames.insert(13, 'genres')
fieldnames.insert(13, 'geographics')
fieldnames.insert(13, 'contents')
fieldnames.insert(6, 'inferreddate')
fieldnames.insert(5, 'authordate')
fieldnames.pop(0)
fieldnames.insert(0, 'docid')

missing = ficset - founddocs

with open('enrichedpre1923ficmeta.tsv', mode = 'w', encoding = 'utf-8') as f:
    writer = csv.DictWriter(f, fieldnames = fieldnames, delimiter = '\t')
    writer.writeheader()
    for key, row in ficmeta.items():