# Script fragment whose line breaks and indentation have been lost.
# It reads fiction_metadata.csv with csv.DictReader, passes each row's "htid"
# through utils.dirty_pairtree, adds the result to the set `ficids`, and stores
# the whole row in the dict `meta` under that converted id.
# It then creates Counters for mystery / scifi / gothic subjects, a list of
# gothic clue phrases, and an empty `genretags` dict.
# The trailing `add_tag` definition is cut off in this view; only its
# "create an empty set for an unseen htid" step is visible.
# NOTE(review): restore the original line structure before running this.
import csv from collections import Counter import SonicScrewdriver as utils ficids = set() meta = dict() ficsource = "/Volumes/TARDIS/work/fiction/metadata/fiction_metadata.csv" with open(ficsource, encoding="utf-8") as f: reader = csv.DictReader(f) fieldnames = reader.fieldnames for row in reader: htid = row["htid"] dirtyhtid = utils.dirty_pairtree(htid) ficids.add(dirtyhtid) meta[dirtyhtid] = row metasource = "/Volumes/TARDIS/work/metadata/MergedMonographs.tsv" mysterysubjects = Counter() scifisubjects = Counter() gothsubjects = Counter() gothclues = ["ghost stories", "gothic revival", "horror"] genretags = dict() def add_tag(genretags, htid, tagtoadd): if htid not in genretags: genretags[htid] = set()
# getidstoadd
#
# Lists the ids named in correctedmeta.tsv that have no matching '.poe.tsv'
# file in the elite/ folder yet, and writes them to ids2get.tsv, one per
# line, after passing each through utils.dirty_pairtree.

import SonicScrewdriver as utils
import os

# The first tab-separated field of every metadata line is taken as the id.
ids2get = []
with open('/Users/tunder/Dropbox/GenreProject/python/granger/correctedmeta.tsv', encoding = 'utf-8') as f:
    for metaline in f.readlines():
        ids2get.append(metaline.split('\t')[0])

# Ids we already hold are the '.poe.tsv' filenames with that suffix removed.
idswehave = {
    name.replace('.poe.tsv', '')
    for name in os.listdir('/Users/tunder/Dropbox/GenreProject/python/granger/elite/')
    if name.endswith('.poe.tsv')
}

with open('/Users/tunder/Dropbox/GenreProject/python/granger/ids2get.tsv', mode = 'w', encoding = 'utf-8') as f:
    for anid in ids2get:
        # An id counts as present if either its raw form or its
        # clean_pairtree form matches a file we already have.
        if anid in idswehave or utils.clean_pairtree(anid) in idswehave:
            continue
        f.write(utils.dirty_pairtree(anid) + '\n')
# Script fragment whose line breaks and indentation have been lost; it begins
# inside an enclosing block that is not visible here.
# Visible steps: append `thisreader` / `thispath` to readerowners[f] / paths[f]
# (exact nesting under the `if` cannot be recovered from this view), print the
# size of `tagset`, load '../bzipmeta.csv' with pandas indexed by 'docid', and
# print every docid (filename minus '.csv') whose utils.dirty_pairtree form is
# not in that index. It then prepares per-genre lists for 'fic' and 'poe' and
# walks readerowners, printing filenames that contain 'metadat'.
# NOTE(review): as the line stands, everything after the first '#' is a comment
# to the interpreter; restore the original line structure before running.
if thisreader not in readerowners[f]: readerowners[f].append(thisreader) paths[f].append(thispath) print(len(tagset)) allfiles = tagset # This is a list of all the filenames (note, filenames not docids) # that we found in the /readers sourcedir. train1 = pd.read_csv('../bzipmeta.csv', dtype = 'object', index_col = 'docid') tidx = set(train1.index.values) for filename in allfiles: docid = filename.replace('.csv', '') if utils.dirty_pairtree(docid) not in tidx: print(docid) genrestocheck = ['fic', 'poe'] equivalences = {'non', 'bio', 'other'} volumesingenre = dict() for g in genrestocheck: volumesingenre[g] = [] alldocids = set() for filename, owners in readerowners.items(): path = paths[filename][0] if 'metadat' in filename: print(filename)
# Reads the ids in getficids1899.txt, converts each one with
# utils.dirty_pairtree, and writes the converted ids to dirtyficids1899.txt,
# one per line.

import SonicScrewdriver as utils

with open('/Users/tunder/Dropbox/GenreProject/metadata/getficids1899.txt', encoding = 'utf-8') as f:
    # Trailing whitespace (including the newline) is stripped from each line.
    ids = [rawline.rstrip() for rawline in f.readlines()]

# Convert every id before the output file is opened.
newids = [utils.dirty_pairtree(anid) for anid in ids]

with open('/Users/tunder/Dropbox/GenreProject/metadata/dirtyficids1899.txt', mode = 'w', encoding = 'utf-8') as f:
    f.writelines(converted + '\n' for converted in newids)
# Script fragment whose line breaks and indentation have been lost; it begins
# inside an enclosing block that is not visible here.
# Visible steps: append `thisreader` / `thispath` to readerowners[f] / paths[f]
# (exact nesting under the `if` cannot be recovered from this view), print the
# size of `tagset`, load 'bzipmeta.csv' with pandas indexed by 'docid', and
# print every docid (filename minus '.csv') whose utils.dirty_pairtree form is
# not in that index. It then prepares per-genre lists for five genre codes and
# creates empty containers for error conditions, amounts, ids and per-document
# percentages.
# NOTE(review): as the line stands, everything after the first '#' is a comment
# to the interpreter; restore the original line structure before running.
if thisreader not in readerowners[f]: readerowners[f].append(thisreader) paths[f].append(thispath) print(len(tagset)) allfiles = tagset # This is a list of all the filenames (note, filenames not docids) # that we found in the /readers sourcedir. train1 = pd.read_csv('bzipmeta.csv', dtype = 'object', index_col = 'docid') tidx = set(train1.index.values) for filename in allfiles: docid = filename.replace('.csv', '') if utils.dirty_pairtree(docid) not in tidx: print(docid) genrestocheck = ['fic', 'poe', 'dra', 'bio', 'non'] equivalences = {'non', 'bio', 'other'} volumesingenre = dict() for g in genrestocheck: volumesingenre[g] = [] alldocids = set() errorconditions = dict() erroramounts = dict() errorids = [] percentagesbydoc = dict()
# Collect the docids whose genre tags contain at least one tag from
# `tagstoget`, ignoring any row tagged 'drop'.
docidstoget = set()

with open(metafile, encoding='utf-8') as f:
    for row in csv.DictReader(f):
        tagset = utils.get_tagset(row['genretags'])
        if 'drop' in tagset:
            continue
        if any(tag in tagset for tag in tagstoget):
            docidstoget.add(row['docid'])

# Docids already on disk are the '.fic.tsv' filenames with the suffix removed.
docidspresent = {
    name.replace('.fic.tsv', '')
    for name in os.listdir('/Users/tunder/Dropbox/fiction/data/')
    if name.endswith('.fic.tsv')
}

docidsneeded = docidstoget - docidspresent

# The output filename carries today's date.
datestamp = str(datetime.date.today())
outfile = '/Users/tunder/Dropbox/fiction/meta/filestoget' + datestamp + '.txt'

with open(outfile, mode='w', encoding='utf-8') as f:
    for docid in docidsneeded:
        f.write(utils.dirty_pairtree(docid) + '\n')
# Converts every id listed in getficids1899.txt with utils.dirty_pairtree and
# saves the results, one per line, to dirtyficids1899.txt.

import SonicScrewdriver as utils

sourcepath = '/Users/tunder/Dropbox/GenreProject/metadata/getficids1899.txt'
targetpath = '/Users/tunder/Dropbox/GenreProject/metadata/dirtyficids1899.txt'

ids = []
with open(sourcepath, encoding='utf-8') as f:
    for rawline in f.readlines():
        # Drop trailing whitespace, including the line terminator.
        ids.append(rawline.rstrip())

# All conversions happen before the output file is created.
newids = [utils.dirty_pairtree(anid) for anid in ids]

with open(targetpath, mode='w', encoding='utf-8') as f:
    for converted in newids:
        f.write(converted + '\n')
# Script fragment whose line breaks and indentation have been lost.
# It loads MergedMonographs.tsv through utils.readtsv (yielding rows, columns
# and a table indexed as table[column][htid]), then reads anovaset.tsv line by
# line. For each line the first field is converted with utils.dirty_pairtree
# and the second is taken as a category, with 'elite' relabelled 'reviewed'
# and 'vulgar' relabelled 'random'. When the htid is found in `rows`, the
# author, title, date (via utils.simple_date), imprint and enumcron are looked
# up and a record is appended to `anovaset`.
# The csv.writer created at the end is never used in the visible text;
# presumably the script continues past this view -- TODO confirm.
# NOTE(review): the line begins with '#', so as it stands the interpreter
# treats all of it as a comment; restore the line structure before running.
# organize_anovaset.py import SonicScrewdriver as utils import csv rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/MergedMonographs.tsv') with open('anovaset.tsv', encoding = 'utf-8') as f: filelines = f.readlines() anovaset = list() for line in filelines: fields = line.split('\t') htid = utils.dirty_pairtree(fields[0]) category = fields[1] if category == 'elite': category = 'reviewed' elif category == 'vulgar': category = 'random' if htid in rows: author = table['author'][htid] title = table['title'][htid] date = utils.simple_date(htid, table) imprint = table['imprint'][htid] enumcron = table['enumcron'][htid] anovaset.append([htid, category, date, enumcron, author, title, imprint]) with open('anovaset.csv', mode='w', encoding = 'utf-8') as f: writer = csv.writer(f)
# Takes the first tab-separated field of every line in richpoemeta1859.tsv,
# converts it with utils.dirty_pairtree, and writes the results to getpoe.txt,
# one per line.

import SonicScrewdriver as utils

metapath = '/Users/tunder/Dropbox/GenreProject/metadata/richpoemeta1859.tsv'
outpath = '/Users/tunder/Dropbox/GenreProject/python/reception/getpoe.txt'

getpoe = []
with open(metapath, encoding = 'utf-8') as f:
    for metaline in f.readlines():
        firstfield = metaline.split('\t')[0]
        getpoe.append(firstfield)

with open(outpath, mode = 'w', encoding = 'utf-8') as f:
    for rawid in getpoe:
        f.write(utils.dirty_pairtree(rawid) + '\n')
# getidstoadd
#
# Compares the ids listed in correctedmeta.tsv with the '.poe.tsv' files
# already in the elite/ folder, and records every id that is still missing in
# ids2get.tsv (converted with utils.dirty_pairtree, one per line).

import SonicScrewdriver as utils
import os

with open(
        '/Users/tunder/Dropbox/GenreProject/python/granger/correctedmeta.tsv',
        encoding='utf-8') as f:
    filelines = f.readlines()

# The id is whatever precedes the first tab on each line.
ids2get = []
for metaline in filelines:
    ids2get.append(metaline.split('\t')[0])

fileswehave = os.listdir(
    '/Users/tunder/Dropbox/GenreProject/python/granger/elite/')

# Strip the suffix from each '.poe.tsv' filename to recover the id.
idswehave = set()
for filename in fileswehave:
    if filename.endswith('.poe.tsv'):
        idswehave.add(filename.replace('.poe.tsv', ''))

with open('/Users/tunder/Dropbox/GenreProject/python/granger/ids2get.tsv',
          mode='w',
          encoding='utf-8') as f:
    for anid in ids2get:
        # Skip ids already held under either the raw or clean_pairtree form.
        if anid in idswehave:
            continue
        if utils.clean_pairtree(anid) in idswehave:
            continue
        f.write(utils.dirty_pairtree(anid) + '\n')
# Add to `docidstoget` every docid whose genre tags include one of the tags in
# `tagstoget`; rows tagged 'drop' are ignored.
with open(metafile, encoding = 'utf-8') as f:
    for row in csv.DictReader(f):
        tagset = utils.get_tagset(row['genretags'])
        if 'drop' in tagset:
            continue
        if any(tag in tagset for tag in tagstoget):
            docidstoget.add(row['docid'])

# Work out which docids are already on disk from the '.fic.tsv' filenames.
filespresent = os.listdir('/Users/tunder/Dropbox/fiction/data/')
docidspresent = set()
for filename in filespresent:
    if filename.endswith('.fic.tsv'):
        docidspresent.add(filename.replace('.fic.tsv', ''))

docidsneeded = docidstoget - docidspresent

# The list of missing files is written to a file named with today's date.
today = datetime.date.today()
outfile = '/Users/tunder/Dropbox/fiction/meta/filestoget' + str(today) + '.txt'

with open(outfile, mode = 'w', encoding = 'utf-8') as f:
    for docid in docidsneeded:
        f.write(utils.dirty_pairtree(docid) + '\n')
# Script fragment whose line breaks and indentation have been lost; it appears
# to begin inside a loop over tar files that is not visible here (it uses
# `afile`, `tar`, `parts` and `maxpage` from outside) -- TODO confirm.
# Visible steps: for page numbers 0..maxpage, extract '<prefix>/<i>.txt' from
# `tar` and decode its lines as UTF-8; pass the pages to header.remove_headers;
# write the returned page list to outfolder + afile (with '.tar' replaced by
# '.txt'), preceding each page with a '<#PG# n>' marker line; then record
# utils.dirty_pairtree of the tar name (minus '.tar') in `alltheids`.
# The commented-out tail would have written a metadata CSV for those ids.
# NOTE(review): as the line stands, everything after the first '#' is a comment
# to the interpreter; restore the original line structure before running.
prefix = parts[0] pages = [] for i in range(0, maxpage + 1): filename = prefix + '/' + str(i) + '.txt' thispage = tar.extractfile(filename) page = [x.decode('utf-8') for x in thispage.readlines()] pages.append(page) pagelist, removed = header.remove_headers(pages, romannumerals) outpath = outfolder + afile outpath = outpath.replace('.tar', '.txt') with open(outpath, mode='w', encoding='utf-8') as f: for idx, page in enumerate(pagelist): f.write('\n<#PG# ' + str(idx) + '>\n') for line in page: f.write(line) theid = utils.dirty_pairtree(afile.replace('.tar', '')) alltheids.append(theid) # with open(outfolder + 'nonfictionmetadata.csv', mode = 'w', encoding = 'utf-8') as f: # writer = csv.DictWriter(f, fieldnames = fieldnames) # writer.writeheader() # for anid in alltheids: # row = metadict[anid] # writer.writerow(row)