# Flag the row as suspicious when catalog metadata disagrees with the
# predicted genre, or when the share of common English words is low.
# NOTE(review): this fragment's original indentation was lost; the enclosing
# loop headers lie outside this view — confirm nesting against the full file.
unexpected_genre = pgenre != 'bio' and not likely

if unexpected_genre and any(tag in genres for tag in ("Biography", "Autobiography")):
    suspicious = True

if unexpected_genre and any(tag in subjects for tag in ("Biography", "Autobiography")):
    suspicious = True
elif unexpected_genre and any("Description and travel" in field for field in (genres, subjects)):
    suspicious = True
elif english < 0.45:
    suspicious = True

# The output column holds True or an empty string, never False.
row['metadatasuspicious'] = True if suspicious else ''

# Add the inferred date first, then rename two columns by popping the old key.
row['inferreddate'] = utils.date_row(row)
row['rawprobability'] = row.pop('probability')
row['englishtop1000pct'] = row.pop('englishpct')

rows[pgenre].append(row)

# now we have all the bio, dra, fic, and poe
# write them to file
# NOTE(review): the two statements below presumably sit outside the per-row
# code above — confirm.
write_genres(rows, firstfile)
firstfile = False
# NOTE(review): the two statements below presumably close out a loop whose
# header lies above this fragment — confirm nesting against the full file.
selecteddates[htid] = date
selected.append(htid)

# Lookup tables filled from the poetry metadata file. authors, titles and
# datesbyhtid are keyed by htid; bydate maps a date to the list of htids
# with that date that are NOT in `selected`.
bydate = dict()
authors = dict()
titles = dict()
datesbyhtid = dict()

# `selected` is a list, so testing `htid in selected` rescanned it for every
# metadata row. A set built once gives the same answer in constant time.
selected_ids = set(selected)

with open('/Users/tunder/work/genre/metadata/poemeta.csv', encoding = 'utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        htid = row['htid']
        authors[htid] = row['author']
        titles[htid] = row['title']
        date = utils.date_row(row)
        datesbyhtid[htid] = date

        # Selected volumes are recorded above but kept out of the by-date pool.
        if htid in selected_ids:
            continue

        bydate.setdefault(date, []).append(htid)

controlset = set()

# Lets an interrupted session resume partway through the selected list.
skip = int(input('Skip how many? '))
for theid in selected[skip:]:
    date = selecteddates[theid]
    print(theid)
    print(date)
import csv, os from collections import Counter import SonicScrewdriver as utils tagas = input("Tag volumes in this session as? ") metadata = dict() authordict = dict() all_docids = set() metasource = '/Volumes/TARDIS/work/metadata/MergedMonographs.tsv' with open(metasource, encoding = 'utf-8') as f: reader = csv.DictReader(f, delimiter = '\t') for row in reader: docid = row['HTid'] row['date'] = utils.date_row(row) metadata[docid] = row authorstring = row['author'] authorstring = authorstring.replace(',', ' ') authorstring = authorstring.replace('.', ' ') authorwords = authorstring.lower().split() for word in authorwords: if word not in authordict: authordict[word] = set() authordict[word].add(docid) all_docids.add(docid) metaout = '/Users/tunder/Dropbox/fiction/meta/genremeta.csv' with open(metaout, encoding = 'utf-8') as f:
newfic = [] oldfic = [] with open(inpath, encoding = 'utf-8') as f: reader = csv.DictReader(f, delimiter = '\t') fieldnames = reader.fieldnames for row in reader: genre = row['sampledas'] if genre != 'bio': continue # right now we're running on biography authdate = row['authordate'] birth, death = cabinet.parse_authordate(authdate) date = utils.date_row(row) if death > 0 and death < 1920: oldfic.append(row) continue elif death > 0 and death + 20 < date: oldfic.append(row) continue else: stdauthor = standardize_name(row['author']) row['stdauthor'] = stdauthor newfic.append(row) def numeric_only(astring): numonly = '' for character in astring: if character.isdigit():
'docid', 'recordid', 'oclc', 'locnum', 'includedbc', 'author', 'authordate', 'imprint', 'inferreddate', 'place', 'enumcron', 'subjects', 'genres', 'title' ] print() print('A total of ' + str(counter) + ' volumes were scanned.') print() print('A total of ' + str(fuzzycounter) + ' volumes were fuzzymatched.') print() with open('../rawdata/sf_intersection.csv', mode='w', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore') writer.writeheader() for row in allsf: if 'htid' in row: row['docid'] = row['htid'] row['includedbc'] = reasons[row['docid']] if 'inferreddate' not in row: row['inferreddate'] = utils.date_row(row) if 'authordate' not in row: row['authordate'] = '' if 'genres' not in row: genres = '' if ' / |' in row['title']: row['title'] = row['title'].split(' / |')[0] writer.writerow(row)
# NOTE(review): the statement below presumably closes out a loop whose header
# lies above this fragment — confirm nesting against the full file.
selectedmeta[htid]['firstpub'] = firstpub

# Lookup tables filled from the fiction metadata file. authors, titles and
# datesbyhtid are keyed by htid; bydate maps a date to the list of htids
# with that date that are NOT in `selected`.
bydate = dict()
authors = dict()
titles = dict()
datesbyhtid = dict()

# `selected` is sliced below, so it is a sequence, and testing
# `htid in selected` rescanned it for every metadata row. A set built once
# gives the same answer in constant time.
selected_ids = set(selected)

with open('/Users/tunder/work/genre/metadata/ficmeta.csv', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        htid = row['htid']
        authors[htid] = row['author']
        titles[htid] = row['title']
        date = utils.date_row(row)
        datesbyhtid[htid] = date

        # Selected volumes are recorded above but kept out of the by-date pool.
        if htid in selected_ids:
            continue

        bydate.setdefault(date, []).append(htid)

controlset = set()
controlmeta = dict()
usedfromselected = list()

# Lets an interrupted session resume partway through the selected list.
skip = int(input('Skip how many? '))
for theid in selected[skip:]:
    date = selecteddates[theid]
improved += 1 # if newauthor != author: # print('New author: ', newauthor) # print('old author: ', author) # user = input('? ') # if newtitle != title: # print('New title: ', newtitle) # print('old title: ', title) # user = input('? ') ficmeta[docid]['title'] = newtitle ficmeta[docid]['author'] = newauthor ficmeta[docid]['inferreddate'] = utils.date_row(row) fieldnames.insert(13, 'genres') fieldnames.insert(13, 'geographics') fieldnames.insert(13, 'contents') fieldnames.insert(6, 'inferreddate') fieldnames.insert(5, 'authordate') fieldnames.pop(0) fieldnames.insert(0, 'docid') missing = ficset - founddocs with open('enrichedpre1923ficmeta.tsv', mode = 'w', encoding = 'utf-8') as f: writer = csv.DictWriter(f, fieldnames = fieldnames, delimiter = '\t') writer.writeheader() for key, row in ficmeta.items():