Ejemplo n.º 1
0
import csv
from collections import Counter
import SonicScrewdriver as utils

# Volume ids found in the fiction metadata, in the form returned by
# utils.dirty_pairtree.
ficids = set()

# Maps each converted volume id to its complete metadata row.
meta = {}

ficsource = "/Volumes/TARDIS/work/fiction/metadata/fiction_metadata.csv"
# newline="" is what the csv module asks for when it is handed a file
# object; without it, line breaks embedded in quoted fields are rewritten
# by the text layer before the parser sees them.
with open(ficsource, encoding="utf-8", newline="") as f:
    reader = csv.DictReader(f)
    # Keep the column order of the source file for later use.
    fieldnames = reader.fieldnames
    for row in reader:
        htid = row["htid"]
        dirtyhtid = utils.dirty_pairtree(htid)
        ficids.add(dirtyhtid)
        meta[dirtyhtid] = row

metasource = "/Volumes/TARDIS/work/metadata/MergedMonographs.tsv"

# Per-genre subject counters and the tag table; they start out empty here.
mysterysubjects = Counter()
scifisubjects = Counter()
gothsubjects = Counter()
gothclues = ["ghost stories", "gothic revival", "horror"]
genretags = {}


def add_tag(genretags, htid, tagtoadd):
    """Make sure ``genretags`` has an entry for ``htid``.

    When ``htid`` is not yet a key, an empty set is stored under it; an
    existing entry is left untouched. ``tagtoadd`` is not used by the part
    of this function that is visible here.
    """
    genretags.setdefault(htid, set())
Ejemplo n.º 2
0
# getidstoadd

import SonicScrewdriver as utils
import os

# Load the corrected metadata; the id of each volume is whatever precedes
# the first tab on its line.
with open('/Users/tunder/Dropbox/GenreProject/python/granger/correctedmeta.tsv', encoding = 'utf-8') as f:
    filelines = list(f)

ids2get = [line.partition('\t')[0] for line in filelines]

# Ids for which a '.poe.tsv' file already sits in the elite/ folder.
fileswehave = os.listdir('/Users/tunder/Dropbox/GenreProject/python/granger/elite/')
idswehave = {name.replace('.poe.tsv','') for name in fileswehave if name.endswith('.poe.tsv')}

with open('/Users/tunder/Dropbox/GenreProject/python/granger/ids2get.tsv', mode = 'w', encoding = 'utf-8') as f:
    for anid in ids2get:
        # Nothing to fetch when the id is on disk, either as written in
        # the metadata or in its clean_pairtree form.
        if anid in idswehave or utils.clean_pairtree(anid) in idswehave:
            continue
        f.write(utils.dirty_pairtree(anid) + '\n')
Ejemplo n.º 3
0
                # Record this reader for file ``f`` only the first time it is
                # seen; the matching path is appended in the same step, so
                # both lists grow together in this fragment.
                if thisreader not in readerowners[f]:
                    readerowners[f].append(thisreader)
                    paths[f].append(thispath)

print(len(tagset))

# allfiles holds every filename (note: filenames, not docids) that was
# found in the /readers sourcedir.
allfiles = tagset

train1 = pd.read_csv('../bzipmeta.csv', dtype = 'object', index_col = 'docid')

# Print the docid of every file whose dirty_pairtree form is absent from
# the metadata index.
tidx = set(train1.index.values)
for filename in allfiles:
    docid = filename.replace('.csv', '')
    if utils.dirty_pairtree(docid) in tidx:
        continue
    print(docid)

genrestocheck = ['fic', 'poe']
equivalences = {'non', 'bio', 'other'}

# One independent, initially empty list per genre being checked.
volumesingenre = {g: [] for g in genrestocheck}

alldocids = set()

# Visit every filename together with the owners recorded for it.
# NOTE(review): the remainder of this loop body is not visible in this
# excerpt, so ``owners`` and ``path`` are presumably used further down.
for filename, owners in readerowners.items():
    # Only the first path recorded for the file is taken.
    path = paths[filename][0]
    # Echo any filename that contains 'metadat'.
    if 'metadat' in filename:
        print(filename)
Ejemplo n.º 4
0
import SonicScrewdriver as utils

# Read the ids to convert, one per line, with trailing whitespace removed.
with open('/Users/tunder/Dropbox/GenreProject/metadata/getficids1899.txt', encoding = 'utf-8') as f:
    ids = [line.rstrip() for line in f]

# Run every id through utils.dirty_pairtree.
newids = [utils.dirty_pairtree(anid) for anid in ids]

# Write the converted ids back out, one per line.
with open('/Users/tunder/Dropbox/GenreProject/metadata/dirtyficids1899.txt', mode = 'w', encoding = 'utf-8') as f:
    f.writelines(anid + '\n' for anid in newids)



Ejemplo n.º 5
0
                # Record this reader for file ``f`` only the first time it is
                # seen; the matching path is appended in the same step, so
                # both lists grow together in this fragment.
                if thisreader not in readerowners[f]:
                    readerowners[f].append(thisreader)
                    paths[f].append(thispath)

print(len(tagset))

# allfiles holds every filename (note: filenames, not docids) that was
# found in the /readers sourcedir.
allfiles = tagset

train1 = pd.read_csv('bzipmeta.csv', dtype = 'object', index_col = 'docid')

# Print the docid of every file whose dirty_pairtree form is absent from
# the metadata index.
tidx = set(train1.index.values)
for filename in allfiles:
    docid = filename.replace('.csv', '')
    if utils.dirty_pairtree(docid) in tidx:
        continue
    print(docid)

genrestocheck = ['fic', 'poe', 'dra', 'bio', 'non']
equivalences = {'non', 'bio', 'other'}

# One independent, initially empty list per genre being checked.
volumesingenre = {g: [] for g in genrestocheck}
alldocids = set()

# Empty containers for the error bookkeeping.
errorconditions = {}
erroramounts = {}
errorids = []
percentagesbydoc = {}
Ejemplo n.º 6
0
# Docids whose genre tags include at least one of the wanted tags.
docidstoget = set()

# newline='' is what the csv module asks for when it is handed a file
# object; without it, line breaks embedded in quoted fields are rewritten
# by the text layer before the parser sees them.
with open(metafile, encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        tagset = utils.get_tagset(row['genretags'])
        # Volumes explicitly marked 'drop' are never fetched.
        if 'drop' in tagset:
            continue
        getthis = any(tag in tagset for tag in tagstoget)

        if getthis:
            docidstoget.add(row['docid'])

filespresent = os.listdir('/Users/tunder/Dropbox/fiction/data/')

# Ids that already have a '.fic.tsv' file in the data folder.
docidspresent = {
    x.replace('.fic.tsv', '') for x in filespresent if x.endswith('.fic.tsv')
}

docidsneeded = docidstoget - docidspresent

# The output name carries today's date.
outfile = '/Users/tunder/Dropbox/fiction/meta/filestoget' + str(
    datetime.date.today()) + '.txt'
with open(outfile, mode='w', encoding='utf-8') as f:
    # Sorted so that the file content is reproducible; plain set iteration
    # order can differ from one run to the next.
    for docid in sorted(docidsneeded):
        outid = utils.dirty_pairtree(docid)
        f.write(outid + '\n')
Ejemplo n.º 7
0
import SonicScrewdriver as utils

# Read the ids to convert, one per line, with trailing whitespace removed.
with open('/Users/tunder/Dropbox/GenreProject/metadata/getficids1899.txt',
          encoding='utf-8') as f:
    ids = [line.rstrip() for line in f]

# Run every id through utils.dirty_pairtree, keeping the input order.
newids = list(map(utils.dirty_pairtree, ids))

# Write the converted ids back out, one per line.
with open('/Users/tunder/Dropbox/GenreProject/metadata/dirtyficids1899.txt',
          mode='w',
          encoding='utf-8') as f:
    f.writelines(converted + '\n' for converted in newids)
Ejemplo n.º 8
0
# organize_anovaset.py

import SonicScrewdriver as utils
import csv

rows, columns, table = utils.readtsv('/Volumes/TARDIS/work/metadata/MergedMonographs.tsv')

with open('anovaset.tsv', encoding = 'utf-8') as f:
    filelines = f.readlines()

# Output rows: [htid, category, date, enumcron, author, title, imprint].
anovaset = []

# Older category labels and the names they are reported under.
renamed = {'elite': 'reviewed', 'vulgar': 'random'}

for line in filelines:
    # Drop the line terminator before splitting: otherwise a category that
    # sits in the last column keeps its '\n' and never matches a label.
    fields = line.rstrip('\n').split('\t')
    if len(fields) < 2:
        # Blank or malformed line: there is no category to read.
        continue
    htid = utils.dirty_pairtree(fields[0])
    category = renamed.get(fields[1], fields[1])

    # Only volumes present in the merged metadata are kept.
    # NOTE(review): if ``rows`` is a list this membership test is linear
    # per line — confirm its type before optimising.
    if htid in rows:
        author = table['author'][htid]
        title = table['title'][htid]
        date = utils.simple_date(htid, table)
        imprint = table['imprint'][htid]
        enumcron = table['enumcron'][htid]
        anovaset.append([htid, category, date, enumcron, author, title, imprint])

# newline='' leaves line endings to the csv writer, as the csv module
# documentation asks for files handed to csv.writer; otherwise an extra
# '\r' is emitted on platforms that translate '\n' to '\r\n'.
with open('anovaset.csv', mode='w', encoding = 'utf-8', newline='') as f:
    writer = csv.writer(f)
Ejemplo n.º 9
0
import SonicScrewdriver as utils

metapath = '/Users/tunder/Dropbox/GenreProject/metadata/richpoemeta1859.tsv'
with open(metapath, encoding = 'utf-8') as f:
    filelines = list(f)

# The volume id is the text before the first tab on each line.
getpoe = [line.partition('\t')[0] for line in filelines]

outpath = '/Users/tunder/Dropbox/GenreProject/python/reception/getpoe.txt'
with open(outpath, mode = 'w', encoding = 'utf-8') as f:
    # One id per line, each passed through utils.dirty_pairtree first.
    for rawid in getpoe:
        f.write(utils.dirty_pairtree(rawid) + '\n')
Ejemplo n.º 10
0
# getidstoadd

import SonicScrewdriver as utils
import os

# Load the corrected metadata; the id of each volume is whatever precedes
# the first tab on its line.
with open(
        '/Users/tunder/Dropbox/GenreProject/python/granger/correctedmeta.tsv',
        encoding='utf-8') as f:
    filelines = list(f)

ids2get = [line.partition('\t')[0] for line in filelines]

# Ids for which a '.poe.tsv' file already sits in the elite/ folder.
fileswehave = os.listdir(
    '/Users/tunder/Dropbox/GenreProject/python/granger/elite/')
idswehave = {
    name.replace('.poe.tsv', '')
    for name in fileswehave
    if name.endswith('.poe.tsv')
}

with open('/Users/tunder/Dropbox/GenreProject/python/granger/ids2get.tsv',
          mode='w',
          encoding='utf-8') as f:
    for anid in ids2get:
        # Nothing to fetch when the id is on disk, either as written in
        # the metadata or in its clean_pairtree form.
        if anid in idswehave or utils.clean_pairtree(anid) in idswehave:
            continue
        f.write(utils.dirty_pairtree(anid) + '\n')
Ejemplo n.º 11
0
# newline='' is what the csv module asks for when it is handed a file
# object; without it, line breaks embedded in quoted fields are rewritten
# by the text layer before the parser sees them.
with open(metafile, encoding = 'utf-8', newline = '') as f:
    reader = csv.DictReader(f)
    for row in reader:
        tagset = utils.get_tagset(row['genretags'])
        # Volumes explicitly marked 'drop' are never fetched.
        if 'drop' in tagset:
            continue
        # Wanted when at least one requested tag is among the row's tags.
        getthis = any(tag in tagset for tag in tagstoget)

        if getthis:
            docidstoget.add(row['docid'])


filespresent = os.listdir('/Users/tunder/Dropbox/fiction/data/')

# Ids that already have a '.fic.tsv' file in the data folder.
docidspresent = {x.replace('.fic.tsv', '') for x in filespresent if x.endswith('.fic.tsv')}

docidsneeded = docidstoget - docidspresent

# The output name carries today's date.
outfile = '/Users/tunder/Dropbox/fiction/meta/filestoget' + str(datetime.date.today()) + '.txt'
with open(outfile, mode = 'w', encoding = 'utf-8') as f:
    # Sorted so that the file content is reproducible; plain set iteration
    # order can differ from one run to the next.
    for docid in sorted(docidsneeded):
        outid = utils.dirty_pairtree(docid)
        f.write(outid + '\n')




Ejemplo n.º 12
0
    # NOTE(review): ``parts``, ``tar``, ``maxpage`` and ``afile`` are set
    # above this excerpt; ``tar`` is presumably an open tarfile.TarFile and
    # ``prefix`` the folder that holds the page files inside it — confirm.
    prefix = parts[0]

    pages = []

    # Read pages 0..maxpage from '<prefix>/<i>.txt'; each page is kept as a
    # list of utf-8-decoded lines.
    for i in range(0, maxpage + 1):
        filename = prefix + '/' + str(i) + '.txt'
        thispage = tar.extractfile(filename)
        page = [x.decode('utf-8') for x in thispage.readlines()]
        pages.append(page)

    # ``removed`` is not used in the visible part of this block.
    pagelist, removed = header.remove_headers(pages, romannumerals)

    # Output goes to outfolder under the archive's name, with '.tar'
    # replaced by '.txt'.
    outpath = outfolder + afile
    outpath = outpath.replace('.tar', '.txt')
    with open(outpath, mode='w', encoding='utf-8') as f:
        for idx, page in enumerate(pagelist):
            # Each page is preceded by a '<#PG# n>' marker on its own line.
            f.write('\n<#PG# ' + str(idx) + '>\n')
            for line in page:
                f.write(line)

    # Remember the id of this volume in its dirty_pairtree form.
    theid = utils.dirty_pairtree(afile.replace('.tar', ''))
    alltheids.append(theid)

# with open(outfolder + 'nonfictionmetadata.csv', mode = 'w', encoding = 'utf-8') as f:
#     writer = csv.DictWriter(f, fieldnames = fieldnames)
#     writer.writeheader()
#     for anid in alltheids:
#         row = metadict[anid]
#         writer.writerow(row)