def get_classvector(classpath, volumeIDs):
    with open(classpath, encoding='utf-8') as f:
        filelines = f.readlines()
    classdict = dict()
    for line in filelines:
        line = line.rstrip()
        fields = line.split('\t')
        volid = utils.clean_pairtree(fields[0])
        theclass = fields[1]
        if theclass == 'elite':
            intclass = 1
        elif theclass == 'vulgar':
            intclass = 0
        else:
            intclass = int(theclass)
        classdict[volid] = intclass

    if len(volumeIDs) < 1:
        volumeIDs = list(classdict.keys())

    classvector = np.zeros(len(volumeIDs))
    for idx, anid in enumerate(volumeIDs):
        if anid in classdict:
            classvector[idx] = classdict[anid]
        else:
            print('Missing from class metadata: ' + anid)

    return classvector, volumeIDs
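# Hedged usage sketch, not part of the original script: get_classvector expects a
# tab-separated file whose rows pair a volume id with 'elite', 'vulgar', or an
# integer class label. The ids and temp file below are purely illustrative; a real
# run would point at project metadata and pass the volume ids actually being modeled.
def _demo_get_classvector():
    import os
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='.tsv', delete=False,
                                     encoding='utf-8') as tmp:
        tmp.write('uc1.b3342759\telite\n')
        tmp.write('mdp.39015004292602\tvulgar\n')
        labelpath = tmp.name
    # An empty id list makes the function label every volume found in the file.
    classvector, volumeIDs = get_classvector(labelpath, [])
    print(classvector, volumeIDs)
    os.remove(labelpath)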
def get_featureframe(vocabulary, positiveIDs, negativeIDs, sourcedir):
    ''' Returns a pandas dataframe with feature counts for all the volumes
    to be used in this model.
    '''

    df = dict()
    # We initially construct the data frame as a dictionary of Series.
    vocabset = set(vocabulary)
    allIDs = positiveIDs + negativeIDs

    for v in vocabulary:
        df[v] = pd.Series(np.zeros(len(allIDs)), index = allIDs)

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding = 'utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                feature = row['feature']

                if feature.startswith('#header'):
                    feature = feature.replace('#header', '')

                if feature in vocabset:
                    # Cast to float so the numeric Series isn't filled with strings.
                    df[feature].loc[docid] = float(row['count'])

    # Now let's refashion the dictionary as an actual dataframe.
    df = pd.DataFrame(df, index = allIDs)
    df = df[vocabulary]
    # This reorders the columns to be in vocab order

    stdscaler = StandardScaler()
    scaleddf = pd.DataFrame(stdscaler.fit_transform(df), index = allIDs)

    return scaleddf
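# Hedged illustration, not from the original source: the scaling step above centers
# each feature column at mean 0 and unit variance. A minimal self-contained sketch
# of that transformation, using fabricated counts and a hypothetical two-word
# vocabulary:
def _demo_standard_scaling():
    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    toy = pd.DataFrame({'the': [120.0, 80.0, 100.0], 'whale': [3.0, 0.0, 9.0]},
                       index=['vol1', 'vol2', 'vol3'])
    scaled = pd.DataFrame(StandardScaler().fit_transform(toy),
                          index=toy.index, columns=toy.columns)
    # Each column now has mean ~0 and unit variance. Note that get_featureframe
    # itself passes only the index, so its scaled frame has integer column labels.
    print(scaled)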
def add_to_ficgenre(docid, existingfile, tagas):
    global outfieldnames, metadata
    with open(existingfile, mode = 'a', encoding = 'utf-8') as f:
        writer = csv.DictWriter(f, fieldnames = outfieldnames)
        o = dict()
        j = metadata[docid]
        fields = [j['HTid'], str(j['date']), j['author'], j['title'], j['enumcron']]
        print(" | ".join(fields))
        o['docid'] = utils.clean_pairtree(j['HTid'])
        o['recordid'] = j['recordid']
        o['oclc'] = j['OCLC']
        o['locnum'] = j['LOCnum']
        o['author'] = j['author']
        o['imprint'] = j['imprint']
        o['date'] = j['date']
        o['firstpub'] = input('First publication date? ')
        o['birthdate'] = input('Author birth year? ')
        o['nationality'] = input('Nationality? ')
        o['gender'] = input('Gender? ')
        o['title'] = j['title']
        o['subjects'] = j['subjects']
        o['enumcron'] = j['enumcron']
        o['genretags'] = tagas
        for key, value in o.items():
            if value == '<blank>':
                o[key] = ''
        writer.writerow(o)
    print('Done.')
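# Hedged sketch, not from the original source: add_to_ficgenre appends one row per
# call, so it presumably assumes the header line already exists in existingfile. A
# minimal stand-alone illustration of that append pattern, with a hypothetical path
# and field list:
def _demo_append_row(path='example_genremeta.csv'):
    import csv
    import os
    fieldnames = ['docid', 'author', 'title', 'genretags']
    newfile = not os.path.isfile(path)
    with open(path, mode='a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if newfile:
            # Only write the header when the file is being created.
            writer.writeheader()
        writer.writerow({'docid': 'uc1.b3342759', 'author': 'Anonymous',
                         'title': 'A Tale', 'genretags': 'fic'})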
def get_vocabulary_and_counts_4pages(metadata, allIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency, but also
    returns a dictionary of wordcounts so we don't have to read them again from the
    file when generating a feature dataframe.

    Adjusted to handle page instances.
    '''

    doc_freq = Counter()
    counts = dict()
    id2group = dict()

    for docid in allIDs:

        path = os.path.join(sourcedir,
                            utils.clean_pairtree(docid) + '.basic.json.bz2')
        volume = parser.PagelistFromJson(path, docid)
        pagecounts = volume.get_feature_list()

        for idx, page in enumerate(pagecounts):
            pageid = docid + '||' + str(idx)

            id2group[pageid] = docid

            counts[pageid] = page
            for key, value in page.items():
                doc_freq[key] += 1

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts, id2group
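# Hedged illustration, not in the original: page instances are keyed as
# 'docid||pageindex', and id2group maps each page key back to its volume. Splitting
# on the last '||' recovers both parts.
def _split_pageid(pageid):
    docid, _, idx = pageid.rpartition('||')
    return docid, int(idx)

# e.g. _split_pageid('uc1.b3342759||14') -> ('uc1.b3342759', 14)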
def get_vocabulary_and_counts(metadata, positiveIDs, negativeIDs, sourcedir,
                              n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs, but also
    returns a dictionary of wordcounts so we don't have to read them again from the
    file when generating a feature dataframe.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()
    counts = dict()

    for docid in allIDs:
        counts[docid] = Counter()
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                word = row['feature']
                if len(word) < 1:
                    continue

                ct = float(row['count'])

                if word.startswith('#header'):
                    word = word.replace('#header', '')
                #
                # This debatable choice treats header words as equivalent
                # to occurrences in the body text. In practice, this seems
                # to slightly improve performance, at least when you're using
                # SVMs and relatively low numbers of features (140-300).
                # Otherwise header words are in practice just discarded, because
                # e.g. #headeract won't be one of the top 250 words.

                doc_freq[word] += 1
                counts[docid][word] += ct

                # # experimental
                # if word.startswith('#'):
                #     squaredfeature = word + 'sqrd'
                #     counts[docid][word] = ct * ct

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts
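# Hedged illustration, not from the original source: the '#header' prefix marks a
# word seen in running headers. Stripping the prefix, as above, folds those
# occurrences into the body-text count for the same word.
def _fold_header_features(rawcounts):
    from collections import Counter
    folded = Counter()
    for word, ct in rawcounts.items():
        if word.startswith('#header'):
            word = word.replace('#header', '')
        folded[word] += ct
    return folded

# e.g. _fold_header_features({'act': 4, '#headeract': 2}) -> Counter({'act': 6})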
def get_vocabulary(metadata, positiveIDs, negativeIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:

                word = row['feature']
                if word.startswith('#header'):
                    word = word.replace('#header', '')

                doc_freq[word] += 1

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab
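# Hedged illustration, not from the original source: assuming each word appears at
# most once in a per-volume count file, the per-row increment above amounts to
# document frequency (number of volumes containing the word) rather than raw
# frequency. A compact self-contained version of that selection:
def _top_n_by_docfreq(docs, n):
    from collections import Counter
    doc_freq = Counter()
    for wordset in docs:
        for word in set(wordset):
            doc_freq[word] += 1
    return [w for w, _ in doc_freq.most_common(n)]

# e.g. _top_n_by_docfreq([{'whale', 'sea'}, {'sea'}, {'sea', 'ship'}], 2)
#      -> ['sea', 'whale'] or ['sea', 'ship'] (ties are broken arbitrarily)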
        nonusa += 1

    if gender == "f":
        female += 1
    elif gender == "m":
        male += 1


bydate = dict()
fictionmetadata = dict()
datesbydocid = dict()

with open("/Users/tunder/work/genre/metadata/ficmeta.csv", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        docid = utils.clean_pairtree(row["htid"])
        fictionmetadata[docid] = row

        date = utils.date_row(row)
        datesbydocid[docid] = date
        if docid in selected:
            continue
        if date in bydate:
            bydate[date].append(docid)
        else:
            bydate[date] = [docid]

controlset = set()
controlmeta = dict()
usedfromselected = list()
outmeta = outmeta.assign(author = outmeta.author.map(lower_and_trim))

outmeta.to_csv('mallet80metadata4experiment.tsv', sep = '\t', index_label = 'docid')

# make paths to these volumes

import SonicScrewdriver as utils
import os

missing = set()
idmapper = dict()

for anid in allselected:

    path, postfix = utils.pairtreepath(anid, '/Volumes/TARDIS/work/ef/fic/')
    totalpath = path + postfix + '/' + utils.clean_pairtree(anid) + '.json.bz2'

    if not os.path.isfile(totalpath):
        # Toggle between the two spellings of uc1 ids: drop the '$' if present,
        # otherwise insert it.
        if '$' in anid:
            newid = anid.replace('uc1.$b', 'uc1.b')
        else:
            newid = anid.replace('uc1.b', 'uc1.$b')

        path, postfix = utils.pairtreepath(newid, '/Volumes/TARDIS/work/ef/fic/')
        totalpath = path + postfix + '/' + utils.clean_pairtree(newid) + '.json.bz2'
        if os.path.isfile(totalpath):
            idmapper[anid] = totalpath
        else:
            missing.add(anid)
    else:
        idmapper[anid] = totalpath
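# Hedged illustration, not from the original source: some HathiTrust ids in the uc1
# namespace occur both with and without a '$' (uc1.b... vs uc1.$b...), so the code
# above retries the lookup under the alternate spelling. A compact version of that
# toggle:
def _alternate_uc1_id(anid):
    if 'uc1.$b' in anid:
        return anid.replace('uc1.$b', 'uc1.b')
    elif 'uc1.b' in anid:
        return anid.replace('uc1.b', 'uc1.$b')
    else:
        return anid

# e.g. _alternate_uc1_id('uc1.b3342759') -> 'uc1.$b3342759'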
            for key, value in self.totalcounts.items():
                writer.writerow([key, value / self.totaltokens])
            writer.writerow(['#sentencelength', self.sentencelength])
            writer.writerow(['#typetoken', self.typetoken])
            writer.writerow(['#linelength', self.linelength])

    def get_volume_features(self):
        '''
        Just like write_volume_features, except we return them
        as a dictionary.
        '''

        if self.totaltokens < 1:
            return Counter(), 0

        else:

            return self.totalcounts, self.totaltokens

if __name__ == "__main__":

    meta = pd.read_csv('/Users/tunder/Dropbox/python/train20/bzipmeta.csv', dtype = 'object', index_col = 'docid')
    for index, row in meta.iterrows():
        inpath = row['filepath']
        vol = VolumeFromJson(inpath, index)
        outpath = '/Volumes/TARDIS/work/train20/' + utils.clean_pairtree(index) + '.csv'
        vol.write_volume_features(outpath, override = True)



def get_meta():
    meta = dict()
    meta['aubirth'] = input("Author's year of birth? ")
    meta['augender'] = input("Author's gender? ")
    meta['national origin'] = input("Author's nationality? ")
    meta['firstpub'] = input('Date of first publication? ')
    return meta


reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1820-1839.csv'
with open(reviews, encoding='utf-8') as f:
    reader = csv.DictReader(f)

    for row in reader:

        htid = utils.clean_pairtree(row['HTid'])
        pubdate = int(row['date'][0:4])
        firstpub = int(row['firstpub'])
        yrrev = int(row['yrrev'])

        if pubdate > yrrev + 5:
            date = yrrev
            print(str(pubdate) + " => " + str(yrrev))
        else:
            date = pubdate

        jgenre = row['Jgenre']

        if jgenre == 'poe':
            selecteddates[htid] = date
            selected.append(htid)
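# Hedged illustration, not from the original source: the date assigned to a reviewed
# title is normally its HathiTrust publication date, but when that date falls more
# than five years after the review year, the review year is used instead (the copy
# in hand is presumably a later reprint).
def _resolve_date(pubdate, yrrev, window=5):
    return yrrev if pubdate > yrrev + window else pubdate

# e.g. _resolve_date(1871, 1822) -> 1822, while _resolve_date(1823, 1822) -> 1823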
pathdict = dict()

with open('../noveltmmeta/get_EF/ids2pathlist.tsv', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        pathdict[fields[0]] = fields[1]

meta = pd.read_csv('topicsample.tsv', sep='\t')

outrows = []
missing = 0
themissing = []

for d in meta.docid:
    cleand = utils.clean_pairtree(d)
    dollarless = d.replace('$', '')

    if d in pathdict:
        outrows.append((d, pathdict[d]))
    elif cleand in pathdict:
        outrows.append((cleand, pathdict[cleand]))
    elif dollarless in pathdict:
        outrows.append((dollarless, pathdict[dollarless]))
    else:
        missing += 1
        themissing.append(d)

with open('pathlist.tsv', mode='w', encoding='utf-8') as f:
    f.write('docid\tpath\n')
    for docid, path in outrows:
        # Write one tab-separated row per volume, matching the 'docid\tpath' header.
        f.write(docid + '\t' + path + '\n')
import csv
import SonicScrewdriver as utils
import random
import os.path

selecteddates = dict()
selected = list()

reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1880-1899_200.csv'
with open(reviews, encoding='utf-8') as f:
    reader = csv.DictReader(f)

    for row in reader:

        htid = utils.clean_pairtree(row['HTid'])
        pubdate = int(row['date'])
        firstpub = int(row['firstpub'])
        yrrev = int(row['yrrev'])

        if pubdate > yrrev + 5:
            date = yrrev
            print(str(pubdate) + " => " + str(yrrev))
        else:
            date = pubdate

        jgenre = row['Jgenre']

        if jgenre == 'fic':
            selecteddates[htid] = date
            selected.append(htid)
def main(sourcedirs, metapath, modeldir, outpath, pairtree=False):
    '''
    This function can be called from outside the module; it accepts
    path information and then iterates through all the files it
    finds in the metadata at "metapath."

    If the pairtree flag is True, we assume sourcedir is the root
    of a pairtree structure. Otherwise we assume it's a flat list.
    '''

    # This task uses a single model; load the one pickled model found in modeldir.

    modelpaths = glob.glob(modeldir + '*.p')
    assert len(modelpaths) == 1
    model = loadamodel(modelpaths[0])

    # Now get metadata.

    metadata = get_metadata(metapath)

    notfound = dict()

    c = 0
    path = ''

    for docid in metadata.index:
        print(c)
        c += 1

        if pairtree:
            found = False
            for sourcedir in sourcedirs:
                path = get_pairtree(sourcedir, docid)
                if os.path.isfile(path):
                    found = True
                    chosenpath = path
            if not found:
                print(path)
                print('file not found')
                error = 'file not found'
                wordcount = 0
            else:
                pagecounts, pageids, error = get_counts_4pages(
                    chosenpath, docid)

        else:
            # In flat (non-pairtree) mode only the first source directory is used.
            path = os.path.join(sourcedirs[0],
                                utils.clean_pairtree(docid) + '.csv')
            pagecounts, pageids, error = pagecounts4file(path)

        if error == 'success':
            volumejson = predict_volume(model, pageids, pagecounts, docid)
            volumestring = json.dumps(volumejson)
            with open(outpath, mode='a', encoding='utf-8') as f:
                f.write(volumestring + '\n')
            print(docid)
        else:
            notfound[docid] = error
            print(docid, error)

    with open('fictionpagesnotfound.txt', mode='a', encoding='utf-8') as f:
        for vol, reason in notfound.items():
            f.write(vol + '\t' + reason + '\n')
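# Hedged sketch, not part of the original module: main() appends one JSON object per
# successfully classified volume to outpath, so the results can be read back as
# JSON Lines. The default path below is hypothetical.
def _read_predictions(outpath='pagepredictions.jsonl'):
    import json
    predictions = []
    with open(outpath, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                predictions.append(json.loads(line))
    return predictions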
                           index_col='docid',
                           sep='\t')
        ctr = 0

        for index, row in meta.iterrows():
            ctr += 1
            if ctr % 100 == 1:
                print(ctr)

            inpath = rootpath + row['path']
            if not os.path.isfile(inpath):
                missing += 1
                print('missing')
            else:
                vol = VolumeFromJson(inpath, index)
                outpath = outfolder + utils.clean_pairtree(index) + '.txt'
                metarow = vol.write_volume(outpath,
                                           folder=outfolder,
                                           override=True,
                                           translator=translator,
                                           use_headers=False,
                                           skip_front=.1,
                                           skip_back=0.05)
                all_outrows.append(metarow)

        print(missing)

        columns = [
            'docid', 'htid', 'totaltokens', 'skipped_pages', 'trimmed_pages',
            'path'
        ]
        print('Done\a')

    elif args[1] == 'wholevolume':
        missing = 0
        path_to_meta = args[2]

        meta = pd.read_csv(path_to_meta,
                           dtype='object',
                           index_col='docid',
                           sep='\t')

        for index, row in meta.iterrows():
            inpath = rootpath + row['path']
            try:
                vol = VolumeFromJson(inpath, index)
                outpath = '../data/' + utils.clean_pairtree(index) + '.tsv'
                vol.write_volume_features([outpath],
                                          folder='../data/',
                                          override=True,
                                          translator=translator,
                                          use_headers=False,
                                          skip_front=0,
                                          skip_back=0)
            except Exception:
                # Treat volumes that fail to load or parse as missing.
                missing += 1

        print(missing)

    elif args[1] == 'divided':

        path_to_meta = args[2]
def main(sourcedir, metapath, modeldir, outpath, pairtree = False):
    '''
    This function can be called from outside the module; it accepts
    path information and then iterates through all the files it
    finds in the metadata at "metapath."

    If the pairtree flag is True, we assume sourcedir is the root
    of a pairtree structure. Otherwise we assume it's a flat list.
    '''

    global allnames, top1000words

    alternatesource = '/projects/ichass/usesofscale/post23/englishmonographs1980-2016/'

    # We're going to store all the models, by name, in a dictionary:

    models = dict()

    modelpaths = glob.glob(modeldir + '*.p')

    for apath in modelpaths:
        name = apath.replace(modeldir, '')
        name = name.replace('.p', '')
        models[name] = loadamodel(apath)

    # Now get metadata.

    metadata = get_metadata(metapath)

    nonficprobs = []
    juvieprobs = []
    wordcounts = []

    c = 0
    for docid in metadata.index:
        print(c)
        c += 1

        if pairtree:
            path1 = get_pairtree(sourcedir, docid)
            path2 = get_pairtree(alternatesource, docid)

            if os.path.isfile(path1):
                chosenpath = path1
            elif os.path.isfile(path2):
                chosenpath = path2
            else:
                # Without this guard, chosenpath would be undefined below.
                chosenpath = None
                print(path1)
                print(path2)
                print('file not found')
                error = 'file not found'
                wordcount = 0

            if chosenpath is not None:
                counts, error, wordcount = counts4json(chosenpath, docid)

        else:
            path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
            counts, error, wordcount = counts4file(path)

        if error == 'success':
            nonficprob, juvenileprob = volume_classification(models, counts)
        else:
            nonficprob = 0.5
            juvenileprob = 0.5

        nonficprobs.append(nonficprob)
        juvieprobs.append(juvenileprob)
        wordcounts.append(wordcount)


    metadata.loc[:, 'nonficprob'] = pd.Series(nonficprobs, index=metadata.index)
    metadata.loc[:, 'juvenileprob'] = pd.Series(juvieprobs, index=metadata.index)
    metadata.loc[:, 'wordcount'] = pd.Series(wordcounts, index=metadata.index)

    metadata.to_csv(outpath)
    metasource = pd.read_csv(args[1], sep='\t')

    missing = 0

    docstoprocess = metasource.docid

    for idx, docid in enumerate(docstoprocess):

        if idx % 100 == 1:
            print(idx)

        if docid in translations:
            docid = translations[docid]

        path, postfix = utils.pairtreepath(docid, '')
        inpath = rootpath + path + postfix + '/' + utils.clean_pairtree(
            docid) + '.json.bz2'

        if os.path.isfile(inpath):
            pass
        elif 'uc1.b' in docid:
            newdoc = docid.replace('uc1.b', 'uc1.$b')
            path, postfix = utils.pairtreepath(newdoc, '')
            inpath = rootpath + path + postfix + '/' + utils.clean_pairtree(
                newdoc) + '.json.bz2'
            if os.path.isfile(inpath):
                translations[docid] = newdoc
            else:
                missing += 1
                print(missing, inpath, 'not found.')
        else:
            missing += 1
    fieldnames = reader.fieldnames

    for row in reader:
        inferred = forceint(row['inferreddate'])
        firstpub = forceint(row['firstpub'])
        if inferred < firstpub:
            print(row['author'])
            print(row['docid'])
            print('inferred: ' + str(inferred))
            print('firstpub: ' + str(firstpub))
            date = int(input('Date of first publication: '))
            row['firstpub'] = str(date)
        if row['docid'] in existing:
            print('existing ' + row['docid'])
        existing.add(row['docid'])
        row['docid'] = utils.clean_pairtree(row['docid'])
        allrows.append(row)
        authornames.add(row['author'])

authornames = list(authornames)

synonyms = dict()

for name in authornames:
    for anothername in authornames:
        if name == anothername:
            continue

        if name in synonyms:
            if synonyms[name] == anothername:
                continue
def get_meta():
    meta = dict()
    meta['aubirth'] = input("Author's year of birth? ")
    meta['augender'] = input("Author's gender? ")
    meta['national origin'] = input("Author's nationality? ")
    meta['firstpub'] = input('Date of first publication? ')
    return meta

reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1860-1879_200.csv'
with open(reviews, encoding = 'utf-8') as f:
    reader = csv.DictReader(f)

    for row in reader:

        htid = utils.clean_pairtree(row['HTid'])
        pubdate = int(row['date'][0:4])
        firstpub = int(row['firstpub'])
        yrrev = int(row['yrrev'])

        if pubdate > yrrev + 5:
            date = yrrev
            print(str(pubdate) + " => " + str(yrrev))
        else:
            date = pubdate

        jgenre = row['Jgenre']

        if jgenre == 'fic':
            selecteddates[htid] = date
            selected.append(htid)
    '/Volumes/TARDIS/work/fullmeta/ic_monographs.tsv'
]

genrecats = [
    'suspense', 'adventure', 'western', 'mystery', 'detective',
    'science fiction', 'fantasy', 'horror', 'gothic', 'romance', 'pulp'
]

doublets = []

for afile in tocheck:
    with open(afile, encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            docid = row['docid']
            alternative = utils.clean_pairtree(docid)
            if docid in docs2categorize:
                dothis = True
                d = docid
            elif alternative in docs2categorize:
                dothis = True
                d = alternative
            else:
                dothis = False

            if dothis:
                g = row['genres'].lower() + " " + row['subjects'].lower()
                genre = 'none'
                for cat in genrecats:
                    if cat in g:
                        genre = 'genre'
# getidstoadd

import SonicScrewdriver as utils
import os

with open('/Users/tunder/Dropbox/GenreProject/python/granger/correctedmeta.tsv', encoding = 'utf-8') as f:
    filelines = f.readlines()

ids2get = [x.split('\t')[0] for x in filelines]

fileswehave = os.listdir('/Users/tunder/Dropbox/GenreProject/python/granger/elite/')
idswehave = set([x.replace('.poe.tsv','') for x in fileswehave if x.endswith('.poe.tsv')])

with open('/Users/tunder/Dropbox/GenreProject/python/granger/ids2get.tsv', mode = 'w', encoding = 'utf-8') as f:
    for anid in ids2get:
        if anid not in idswehave and utils.clean_pairtree(anid) not in idswehave:
            f.write(utils.dirty_pairtree(anid) + '\n')
# a newer metadata set.

import csv
import SonicScrewdriver as utils
import random

selecteddates = dict()
selected = list()

reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1840-1859_200.csv'
with open(reviews, encoding = 'utf-8') as f:
    reader = csv.DictReader(f)

    for row in reader:

        htid = utils.clean_pairtree(row['HTid'])
        pubdate = int(row['date'])
        firstpub = int(row['firstpub'])
        yrrev = int(row['yrrev'])

        if pubdate > yrrev + 5:
            date = yrrev
            print(str(pubdate) + " => " + str(yrrev))
        else:
            date = pubdate

        jgenre = row['Jgenre']

        if jgenre == 'poe':
            selecteddates[htid] = date
            selected.append(htid)
def main(sourcedir, metapath, modeldir, outpath, pairtree=False):
    '''
    This function can be called from outside the module; it accepts
    path information and then iterates through all the files it
    finds in the metadata at "metapath."

    If the pairtree flag is True, we assume sourcedir is the root
    of a pairtree structure. Otherwise we assume it's a flat list.
    '''

    global allnames, top1000words

    # We're going to store all the models, by name, in a dictionary:

    models = dict()

    for name in allnames:
        models[name] = loadamodel(modeldir + name)

    # Now get metadata.

    metadata = get_metadata(metapath)

    predictedgenres = []
    predictedprobs = []
    explanations = []
    wordcounts = []
    englishpcts = []

    c = 0
    for docid in metadata.index:
        print(c)
        c += 1

        if pairtree:
            path = get_pairtree(sourcedir, docid)
            counts, error, wordcount = counts4json(path, docid)
        else:
            path = os.path.join(sourcedir,
                                utils.clean_pairtree(docid) + '.csv')
            counts, error, wordcount = counts4file(path)

        if error == 'success':
            genredict = make_genredict(metadata, docid)
            englishpct = get_english_percent(counts, top1000words)
            genre, probability, explanation = volume_classification(
                models, counts, genredict)
        else:
            englishpct = 0
            genre = 'NA'
            probability = 0
            explanation = error

        predictedgenres.append(genre)
        predictedprobs.append(probability)
        explanations.append(explanation)
        wordcounts.append(wordcount)
        englishpcts.append(englishpct)

    metadata.loc[:, 'predictedgenre'] = pd.Series(predictedgenres,
                                                  index=metadata.index)
    metadata.loc[:, 'probability'] = pd.Series(predictedprobs,
                                               index=metadata.index)
    metadata.loc[:, 'wordcount'] = pd.Series(wordcounts, index=metadata.index)
    metadata.loc[:, 'englishpct'] = pd.Series(englishpcts,
                                              index=metadata.index)
    metadata.loc[:, 'explanation'] = pd.Series(explanations,
                                               index=metadata.index)

    metadata.to_csv(outpath)
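# Hedged usage note, not from the original module: the output csv carries one row
# per volume with the predicted genre and its probability, so low-confidence calls
# can be filtered afterwards. The threshold and path handling are illustrative only.
def _confident_predictions(outpath, threshold=0.9):
    import pandas as pd
    results = pd.read_csv(outpath)
    return results[results.probability >= threshold]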
        aliases[row['alias']] = row['ourname']

# find matches

key2docid = dict()
volbackup = dict()

with open('../metadata/filtered_fiction_plus_18c.tsv', encoding='utf-8') as f:
    reader = csv.DictReader(f, delimiter='\t')
    for row in reader:

        try:
            # Purely numeric docids pass through unchanged; others are pairtree-cleaned.
            int(row['docid'])
            docid = row['docid']
        except ValueError:
            docid = utils.clean_pairtree(row['docid'])

        possibleauth = row['author']
        possibletitle = normalize_title(row['title'])
        found = False
        for key, value in genremeta.items():
            author, title, normauth = key
            if match(normauth, title, possibleauth, possibletitle):
                key2docid[key] = docid
                volbackup[key] = utils.clean_pairtree(row['volid'])
                found = True
                print('Found: ', possibleauth, author, possibletitle)
                break

print('Found a total of ', len(key2docid))
list_of_dataframes = []
idset = set()

list_of_files = args[1:]
root = '../rawdata/'
list_of_paths = [root + x for x in list_of_files]

for p in list_of_paths:
    df = pd.read_csv(p, index_col='docid')
    list_of_dataframes.append(df)
    idset = idset | set(df.index)

ids = []
for anid in idset:
    ids.append(utils.clean_pairtree(str(anid)))

allpaths = set()
with open('/Volumes/TARDIS/work/ef/htrc-ef-all-files.txt',
          encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        allpaths.add(line)

missing = set()
found = set()
mapping = dict()
path2id = dict()

#things we already have: