def get_classvector(classpath, volumeIDs):
    ''' Reads a tab-separated class file and returns a vector of integer class
    labels aligned with volumeIDs. 'elite' maps to 1, 'vulgar' to 0, and any
    other value is interpreted as an integer. If volumeIDs is empty, all ids
    in the class file are used.
    '''

    with open(classpath, encoding='utf-8') as f:
        filelines = f.readlines()

    classdict = dict()
    for line in filelines:
        line = line.rstrip()
        fields = line.split('\t')
        volid = utils.clean_pairtree(fields[0])
        theclass = fields[1]
        if theclass == 'elite':
            intclass = 1
        elif theclass == 'vulgar':
            intclass = 0
        else:
            intclass = int(theclass)
        classdict[volid] = intclass

    if len(volumeIDs) < 1:
        volumeIDs = [x for x in classdict.keys()]

    classvector = np.zeros(len(volumeIDs))
    for idx, anid in enumerate(volumeIDs):
        if anid in classdict:
            classvector[idx] = classdict[anid]
        else:
            print('Missing from class metadata: ' + anid)

    return classvector, volumeIDs
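# A minimal, self-contained sketch (not part of the original script; the ids
# and labels below are invented) of what get_classvector returns: a numpy
# vector of integer class labels aligned index-for-index with volumeIDs,
# which can serve directly as the y vector for a scikit-learn estimator.
import numpy as np

demo_classdict = {'vol1': 1, 'vol2': 0, 'vol3': 1}   # 'elite' -> 1, 'vulgar' -> 0
demo_ids = ['vol1', 'vol2', 'vol3']
demo_vector = np.zeros(len(demo_ids))
for idx, anid in enumerate(demo_ids):
    demo_vector[idx] = demo_classdict.get(anid, 0)
print(demo_vector)   # [1. 0. 1.]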
def add_to_ficgenre(docid, existingfile, tagas):
    global outfieldnames, metadata

    with open(existingfile, mode='a', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=outfieldnames)
        o = dict()
        j = metadata[docid]
        fields = [j['HTid'], str(j['date']), j['author'], j['title'], j['enumcron']]
        print(" | ".join(fields))
        o['docid'] = utils.clean_pairtree(j['HTid'])
        o['recordid'] = j['recordid']
        o['oclc'] = j['OCLC']
        o['locnum'] = j['LOCnum']
        o['author'] = j['author']
        o['imprint'] = j['imprint']
        o['date'] = j['date']
        o['firstpub'] = input('First publication date? ')
        o['birthdate'] = input('Author birth year? ')
        o['nationality'] = input('Nationality? ')
        o['gender'] = input('Gender? ')
        o['title'] = j['title']
        o['subjects'] = j['subjects']
        o['enumcron'] = j['enumcron']
        o['genretags'] = tagas

        for key, value in o.items():
            if o[key] == '<blank>':
                o[key] = ''

        writer.writerow(o)

    print('Done.')
def get_featureframe(vocabulary, positiveIDs, negativeIDs, sourcedir):
    ''' Returns a pandas dataframe with feature counts for all the volumes
    to be used in this model.
    '''

    df = dict()
    # We initially construct the data frame as a dictionary of Series.
    vocabset = set(vocabulary)
    allIDs = positiveIDs + negativeIDs

    for v in vocabulary:
        df[v] = pd.Series(np.zeros(len(allIDs)), index=allIDs)

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                feature = row['feature']
                if feature.startswith('#header'):
                    feature = feature.replace('#header', '')
                if feature in vocabset:
                    df[feature].loc[docid] = float(row['count'])

    # Now let's refashion the dictionary as an actual dataframe.
    df = pd.DataFrame(df, index=allIDs)
    df = df[vocabulary]  # This reorders the columns to be in vocab order.

    stdscaler = StandardScaler()
    scaleddf = pd.DataFrame(stdscaler.fit_transform(df), index=allIDs)

    return scaleddf
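# A small, self-contained illustration (toy data, not from the project) of the
# scaling step at the end of get_featureframe: StandardScaler centers each
# feature column on 0 with unit (population) variance before the frame is
# handed to a model.
import pandas as pd
from sklearn.preprocessing import StandardScaler

demo_df = pd.DataFrame({'the': [10.0, 20.0, 30.0], 'whale': [1.0, 0.0, 2.0]},
                       index=['volA', 'volB', 'volC'])
demo_scaled = pd.DataFrame(StandardScaler().fit_transform(demo_df),
                           index=demo_df.index, columns=demo_df.columns)
print(demo_scaled.mean().round(6))    # each column is approximately 0
print(demo_scaled.std(ddof=0))        # each column is 1 (population std)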
def get_vocabulary_and_counts_4pages(metadata, allIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency, but also returns a dictionary
    of wordcounts so we don't have to read them again from the file when
    generating a feature dataframe. Adjusted to handle page instances.
    '''

    doc_freq = Counter()
    counts = dict()
    id2group = dict()

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.basic.json.bz2')
        volume = parser.PagelistFromJson(path, docid)
        pagecounts = volume.get_feature_list()

        for idx, page in enumerate(pagecounts):
            pageid = docid + '||' + str(idx)
            id2group[pageid] = docid
            counts[pageid] = page
            for key, value in page.items():
                doc_freq[key] += 1

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts, id2group
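# Self-contained sketch (illustrative docid only) of the page-id convention
# used in get_vocabulary_and_counts_4pages: each page is keyed as
# '<docid>||<index>', and id2group maps the page id back to its parent volume
# so page-level results can be regrouped by volume later.
demo_docid = 'uc1.b1234567'
demo_pageids = [demo_docid + '||' + str(i) for i in range(3)]
demo_id2group = {pageid: demo_docid for pageid in demo_pageids}
print(demo_pageids)                      # ['uc1.b1234567||0', 'uc1.b1234567||1', 'uc1.b1234567||2']
print(demo_id2group[demo_pageids[2]])    # 'uc1.b1234567'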
def get_vocabulary_and_counts(metadata, positiveIDs, negativeIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs,
    but also returns a dictionary of wordcounts so we don't have to read
    them again from the file when generating a feature dataframe.
    '''

    allIDs = positiveIDs + negativeIDs

    doc_freq = Counter()
    counts = dict()

    for docid in allIDs:
        counts[docid] = Counter()
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                word = row['feature']
                if len(word) < 1:
                    continue

                ct = float(row['count'])

                if word.startswith('#header'):
                    word = word.replace('#header', '')
                #
                # This debatable choice treats header words as equivalent
                # to occurrences in the body text. In practice, this seems
                # to slightly improve performance, at least when you're using
                # SVMs and relatively low numbers of features (140-300).
                # Otherwise header words are in practice just discarded, because
                # e.g. #headeract won't be one of the top 250 words.

                doc_freq[word] += 1
                counts[docid][word] += ct

                # experimental
                # if word.startswith('#'):
                #     squaredfeature = word + 'sqrd'
                #     counts[docid][word] = ct * ct

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab, counts
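# A minimal, self-contained illustration (invented rows) of the header-merging
# choice documented in get_vocabulary_and_counts above: stripping the '#header'
# prefix makes header occurrences accumulate under the same key as body-text
# occurrences, in both the document-frequency and per-volume counts.
from collections import Counter

demo_rows = [('and', 100.0), ('#headerand', 3.0), ('#headerchapter', 2.0)]
demo_counts = Counter()
for word, ct in demo_rows:
    if word.startswith('#header'):
        word = word.replace('#header', '')
    demo_counts[word] += ct
print(demo_counts)   # Counter({'and': 103.0, 'chapter': 2.0})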
def get_vocabulary(metadata, positiveIDs, negativeIDs, sourcedir, n):
    ''' Gets the top n words by docfrequency in positiveIDs + negativeIDs. '''

    allIDs = positiveIDs + negativeIDs
    doc_freq = Counter()

    for docid in allIDs:
        path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
        with open(path, encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                word = row['feature']
                if word.startswith('#header'):
                    word = word.replace('#header', '')
                doc_freq[word] += 1

    vocab = [x[0] for x in doc_freq.most_common(n)]
    print('Vocabulary constructed.')

    return vocab
outmeta = outmeta.assign(author=outmeta.author.map(lower_and_trim))
outmeta.to_csv('mallet80metadata4experiment.tsv', sep='\t', index_label='docid')

# make paths to these volumes

import SonicScrewdriver as utils
import os

missing = set()
idmapper = dict()

for anid in allselected:
    path, postfix = utils.pairtreepath(anid, '/Volumes/TARDIS/work/ef/fic/')
    totalpath = path + postfix + '/' + utils.clean_pairtree(anid) + '.json.bz2'

    if not os.path.isfile(totalpath):
        # Try the alternate spelling of uc1 ids: with or without the '$'.
        if '$' in anid:
            newid = anid.replace('uc1.$b', 'uc1.b')
        else:
            newid = anid.replace('uc1.b', 'uc1.$b')

        path, postfix = utils.pairtreepath(newid, '/Volumes/TARDIS/work/ef/fic/')
        totalpath = path + postfix + '/' + utils.clean_pairtree(newid) + '.json.bz2'
        if os.path.isfile(totalpath):
            idmapper[anid] = totalpath
        else:
            missing.add(anid)
    else:
        idmapper[anid] = totalpath
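# Self-contained illustration (toy id, not a real HathiTrust volume) of the
# uc1.b / uc1.$b fallback used above: some volumes are stored under the
# spelling of the id without '$', so the alternate form is tried before
# giving up on the file.
demo_id = 'uc1.$b123456'
if '$' in demo_id:
    demo_alternate = demo_id.replace('uc1.$b', 'uc1.b')
else:
    demo_alternate = demo_id.replace('uc1.b', 'uc1.$b')
print(demo_alternate)   # 'uc1.b123456'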
            for key, value in self.totalcounts.items():
                writer.writerow([key, value / self.totaltokens])

            writer.writerow(['#sentencelength', self.sentencelength])
            writer.writerow(['#typetoken', self.typetoken])
            writer.writerow(['#linelength', self.linelength])

    def get_volume_features(self):
        ''' Just like write_volume_features, except we return them
        as a dictionary.
        '''

        if self.totaltokens < 1:
            return Counter(), 0
        else:
            return self.totalcounts, self.totaltokens


if __name__ == "__main__":

    meta = pd.read_csv('/Users/tunder/Dropbox/python/train20/bzipmeta.csv',
                       dtype='object', index_col='docid')

    for index, row in meta.iterrows():
        inpath = row['filepath']
        vol = VolumeFromJson(inpath, index)
        outpath = '/Volumes/TARDIS/work/train20/' + utils.clean_pairtree(index) + '.csv'
        vol.write_volume_features(outpath, override=True)
def get_meta():
    meta = dict()
    meta['aubirth'] = input("Author's year of birth? ")
    meta['augender'] = input("Author's gender? ")
    meta['national origin'] = input("Author's nationality? ")
    meta['firstpub'] = input('Date of first publication? ')
    return meta


reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1820-1839.csv'

with open(reviews, encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        htid = utils.clean_pairtree(row['HTid'])
        pubdate = int(row['date'][0:4])
        firstpub = int(row['firstpub'])
        yrrev = int(row['yrrev'])

        if pubdate > yrrev + 5:
            date = yrrev
            print(str(pubdate) + " => " + str(yrrev))
        else:
            date = pubdate

        jgenre = row['Jgenre']

        if jgenre == 'poe':
            selecteddates[htid] = date
            selected.append(htid)
pathdict = dict()
with open('../noveltmmeta/get_EF/ids2pathlist.tsv', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        pathdict[fields[0]] = fields[1]

meta = pd.read_csv('topicsample.tsv', sep='\t')

outrows = []
missing = 0
themissing = []

for d in meta.docid:
    cleand = utils.clean_pairtree(d)
    dollarless = d.replace('$', '')

    if d in pathdict:
        outrows.append((d, pathdict[d]))
    elif cleand in pathdict:
        outrows.append((cleand, pathdict[cleand]))
    elif dollarless in pathdict:
        outrows.append((dollarless, pathdict[dollarless]))
    else:
        missing += 1
        themissing.append(d)

with open('pathlist.tsv', mode='w', encoding='utf-8') as f:
    f.write('docid\tpath\n')
    for docid, path in outrows:
import csv
import SonicScrewdriver as utils
import random
import os.path

selecteddates = dict()
selected = list()

reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1880-1899_200.csv'

with open(reviews, encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        htid = utils.clean_pairtree(row['HTid'])
        pubdate = int(row['date'])
        firstpub = int(row['firstpub'])
        yrrev = int(row['yrrev'])

        if pubdate > yrrev + 5:
            date = yrrev
            print(str(pubdate) + " => " + str(yrrev))
        else:
            date = pubdate

        jgenre = row['Jgenre']

        if jgenre == 'fic':
            selecteddates[htid] = date
            selected.append(htid)
def main(sourcedirs, metapath, modeldir, outpath, pairtree=False):
    ''' This function can be called from outside the module; it accepts
    path information and then iterates through all the files it finds in
    the metadata at "metapath."

    If the pairtree flag is True, we assume each directory in sourcedirs
    is the root of a pairtree structure. Otherwise we assume a flat list
    of files.
    '''

    # We're going to store all the models, by name, in a dictionary:

    models = []
    modelpaths = glob.glob(modeldir + '*.p')
    assert len(modelpaths) == 1
    model = loadamodel(modelpaths[0])

    # Now get metadata.

    metadata = get_metadata(metapath)

    notfound = dict()
    c = 0
    path = ''

    for docid in metadata.index:
        print(c)
        c += 1

        if pairtree:
            found = False
            for sourcedir in sourcedirs:
                path = get_pairtree(sourcedir, docid)
                if os.path.isfile(path):
                    found = True
                    chosenpath = path

            if not found:
                print(path)
                print('file not found')
                error = 'file not found'
                wordcount = 0
            else:
                pagecounts, pageids, error = get_counts_4pages(chosenpath, docid)
        else:
            # Flat-file mode; the first entry in sourcedirs is treated as
            # the single source directory.
            path = os.path.join(sourcedirs[0], utils.clean_pairtree(docid) + '.csv')
            pagecounts, pageids, error = pagecounts4file(path)

        if error == 'success':
            volumejson = predict_volume(model, pageids, pagecounts, docid)
            volumestring = json.dumps(volumejson)
            with open(outpath, mode='a', encoding='utf-8') as f:
                f.write(volumestring + '\n')
            print(docid)
        else:
            notfound[docid] = error
            print(docid, error)

    with open('fictionpagesnotfound.txt', mode='a', encoding='utf-8') as f:
        for vol, reason in notfound.items():
            f.write(vol + '\t' + reason + '\n')
                       index_col='docid', sep='\t')

    ctr = 0
    for index, row in meta.iterrows():
        ctr += 1
        if ctr % 100 == 1:
            print(ctr)

        inpath = rootpath + row['path']
        if not os.path.isfile(inpath):
            missing += 1
            print('missing')
        else:
            vol = VolumeFromJson(inpath, index)
            outpath = outfolder + utils.clean_pairtree(index) + '.txt'
            metarow = vol.write_volume(outpath, folder=outfolder, override=True,
                                       translator=translator, use_headers=False,
                                       skip_front=.1, skip_back=0.05)
            all_outrows.append(metarow)

    print(missing)

    columns = ['docid', 'htid', 'totaltokens', 'skipped_pages', 'trimmed_pages', 'path']
    print('Done\a')

elif args[1] == 'wholevolume':
    missing = 0
    path_to_meta = args[2]
    meta = pd.read_csv(path_to_meta, dtype='object', index_col='docid', sep='\t')

    for index, row in meta.iterrows():
        inpath = rootpath + row['path']
        try:
            vol = VolumeFromJson(inpath, index)
            outpath = '../data/' + utils.clean_pairtree(index) + '.tsv'
            vol.write_volume_features([outpath], folder='../data/', override=True,
                                      translator=translator, use_headers=False,
                                      skip_front=0, skip_back=0)
        except Exception:
            missing += 1

    print(missing)

elif args[1] == 'divided':
    path_to_meta = args[2]
def main(sourcedir, metapath, modeldir, outpath, pairtree=False):
    ''' This function can be called from outside the module; it accepts
    path information and then iterates through all the files it finds in
    the metadata at "metapath."

    If the pairtree flag is True, we assume sourcedir is the root of a
    pairtree structure. Otherwise we assume it's a flat list.
    '''

    global allnames, top1000words

    alternatesource = '/projects/ichass/usesofscale/post23/englishmonographs1980-2016/'

    # We're going to store all the models, by name, in a dictionary:

    models = dict()
    modelpaths = glob.glob(modeldir + '*.p')
    for apath in modelpaths:
        name = apath.replace(modeldir, '')
        name = name.replace('.p', '')
        models[name] = loadamodel(apath)

    # Now get metadata.

    metadata = get_metadata(metapath)

    nonficprobs = []
    juvieprobs = []
    wordcounts = []

    c = 0
    for docid in metadata.index:
        print(c)
        c += 1

        if pairtree:
            path1 = get_pairtree(sourcedir, docid)
            path2 = get_pairtree(alternatesource, docid)

            if os.path.isfile(path1):
                chosenpath = path1
            elif os.path.isfile(path2):
                chosenpath = path2
            else:
                chosenpath = ''
                print(path1)
                print(path2)
                print('file not found')

            if chosenpath:
                counts, error, wordcount = counts4json(chosenpath, docid)
            else:
                counts, error, wordcount = dict(), 'file not found', 0
        else:
            path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
            counts, error, wordcount = counts4file(path)

        if error == 'success':
            nonficprob, juvenileprob = volume_classification(models, counts)
        else:
            nonficprob = 0.5
            juvenileprob = 0.5

        nonficprobs.append(nonficprob)
        juvieprobs.append(juvenileprob)
        wordcounts.append(wordcount)

    metadata.loc[:, 'nonficprob'] = pd.Series(nonficprobs, index=metadata.index)
    metadata.loc[:, 'juvenileprob'] = pd.Series(juvieprobs, index=metadata.index)
    metadata.loc[:, 'wordcount'] = pd.Series(wordcounts, index=metadata.index)

    metadata.to_csv(outpath)
metasource = pd.read_csv(args[1], sep='\t')

missing = 0
docstoprocess = metasource.docid

for idx, docid in enumerate(docstoprocess):
    if idx % 100 == 1:
        print(idx)

    if docid in translations:
        docid = translations[docid]

    path, postfix = utils.pairtreepath(docid, '')
    inpath = rootpath + path + postfix + '/' + utils.clean_pairtree(docid) + '.json.bz2'

    if os.path.isfile(inpath):
        pass
    elif 'uc1.b' in docid:
        newdoc = docid.replace('uc1.b', 'uc1.$b')
        path, postfix = utils.pairtreepath(newdoc, '')
        inpath = rootpath + path + postfix + '/' + utils.clean_pairtree(newdoc) + '.json.bz2'
        if os.path.isfile(inpath):
            translations[docid] = newdoc
        else:
            missing += 1
            print(missing, inpath, 'not found.')
    else:
        missing += 1
    fieldnames = reader.fieldnames
    for row in reader:
        inferred = forceint(row['inferreddate'])
        firstpub = forceint(row['firstpub'])

        if inferred < firstpub:
            print(row['author'])
            print(row['docid'])
            print('inferred: ' + str(inferred))
            print('firstpub: ' + str(firstpub))
            date = int(input('Date of first publication: '))
            row['firstpub'] = str(date)

        if row['docid'] in existing:
            print('existing ' + row['docid'])

        existing.add(row['docid'])
        row['docid'] = utils.clean_pairtree(row['docid'])
        allrows.append(row)
        authornames.add(row['author'])

authornames = list(authornames)
synonyms = dict()

for name in authornames:
    for anothername in authornames:
        if name == anothername:
            continue
        if name in synonyms:
            if synonyms[name] == anothername:
                continue
def get_meta():
    meta = dict()
    meta['aubirth'] = input("Author's year of birth? ")
    meta['augender'] = input("Author's gender? ")
    meta['national origin'] = input("Author's nationality? ")
    meta['firstpub'] = input('Date of first publication? ')
    return meta


reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1860-1879_200.csv'

with open(reviews, encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        htid = utils.clean_pairtree(row['HTid'])
        pubdate = int(row['date'][0:4])
        firstpub = int(row['firstpub'])
        yrrev = int(row['yrrev'])

        if pubdate > yrrev + 5:
            date = yrrev
            print(str(pubdate) + " => " + str(yrrev))
        else:
            date = pubdate

        jgenre = row['Jgenre']

        if jgenre == 'fic':
            selecteddates[htid] = date
            selected.append(htid)
    '/Volumes/TARDIS/work/fullmeta/ic_monographs.tsv'
]

genrecats = ['suspense', 'adventure', 'western', 'mystery', 'detective',
             'science fiction', 'fantasy', 'horror', 'gothic', 'romance', 'pulp']

doublets = []

for afile in tocheck:
    with open(afile, encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            docid = row['docid']
            alternative = utils.clean_pairtree(docid)

            if docid in docs2categorize:
                dothis = True
                d = docid
            elif alternative in docs2categorize:
                dothis = True
                d = alternative
            else:
                dothis = False

            if dothis:
                g = row['genres'].lower() + " " + row['subjects'].lower()
                genre = 'none'
                for cat in genrecats:
                    if cat in g:
                        genre = 'genre'
# getidstoadd

import SonicScrewdriver as utils
import os

with open('/Users/tunder/Dropbox/GenreProject/python/granger/correctedmeta.tsv',
          encoding='utf-8') as f:
    filelines = f.readlines()

ids2get = [x.split('\t')[0] for x in filelines]

fileswehave = os.listdir('/Users/tunder/Dropbox/GenreProject/python/granger/elite/')
idswehave = set([x.replace('.poe.tsv', '') for x in fileswehave if x.endswith('.poe.tsv')])

with open('/Users/tunder/Dropbox/GenreProject/python/granger/ids2get.tsv',
          mode='w', encoding='utf-8') as f:
    for anid in ids2get:
        if anid not in idswehave and utils.clean_pairtree(anid) not in idswehave:
            f.write(utils.dirty_pairtree(anid) + '\n')
# a newer metadata set.

import csv
import SonicScrewdriver as utils
import random

selecteddates = dict()
selected = list()

reviews = '/Users/tunder/Dropbox/ted/reception/reviewed/lists/ReviewedTitles1840-1859_200.csv'

with open(reviews, encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        htid = utils.clean_pairtree(row['HTid'])
        pubdate = int(row['date'])
        firstpub = int(row['firstpub'])
        yrrev = int(row['yrrev'])

        if pubdate > yrrev + 5:
            date = yrrev
            print(str(pubdate) + " => " + str(yrrev))
        else:
            date = pubdate

        jgenre = row['Jgenre']

        if jgenre == 'poe':
            selecteddates[htid] = date
            selected.append(htid)
def main(sourcedir, metapath, modeldir, outpath, pairtree=False):
    ''' This function can be called from outside the module; it accepts
    path information and then iterates through all the files it finds in
    the metadata at "metapath."

    If the pairtree flag is True, we assume sourcedir is the root of a
    pairtree structure. Otherwise we assume it's a flat list.
    '''

    global allnames, top1000words

    # We're going to store all the models, by name, in a dictionary:

    models = dict()
    for name in allnames:
        models[name] = loadamodel(modeldir + name)

    # Now get metadata.

    metadata = get_metadata(metapath)

    predictedgenres = []
    predictedprobs = []
    explanations = []
    wordcounts = []
    englishpcts = []

    c = 0
    for docid in metadata.index:
        print(c)
        c += 1

        if pairtree:
            path = get_pairtree(sourcedir, docid)
            counts, error, wordcount = counts4json(path, docid)
        else:
            path = os.path.join(sourcedir, utils.clean_pairtree(docid) + '.csv')
            counts, error, wordcount = counts4file(path)

        if error == 'success':
            genredict = make_genredict(metadata, docid)
            englishpct = get_english_percent(counts, top1000words)
            genre, probability, explanation = volume_classification(models, counts, genredict)
        else:
            englishpct = 0
            genre = 'NA'
            probability = 0
            explanation = error

        predictedgenres.append(genre)
        predictedprobs.append(probability)
        explanations.append(explanation)
        wordcounts.append(wordcount)
        englishpcts.append(englishpct)

    metadata.loc[:, 'predictedgenre'] = pd.Series(predictedgenres, index=metadata.index)
    metadata.loc[:, 'probability'] = pd.Series(predictedprobs, index=metadata.index)
    metadata.loc[:, 'wordcount'] = pd.Series(wordcounts, index=metadata.index)
    metadata.loc[:, 'englishpct'] = pd.Series(englishpcts, index=metadata.index)
    metadata.loc[:, 'explanation'] = pd.Series(explanations, index=metadata.index)

    metadata.to_csv(outpath)
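# Self-contained sketch (toy frame, invented values) of how main() attaches its
# predictions: building each pd.Series on the metadata's own index keeps rows
# aligned by docid when the new columns are assigned with .loc.
import pandas as pd

demo_meta = pd.DataFrame({'title': ['A', 'B']}, index=['vol1', 'vol2'])
demo_meta.loc[:, 'predictedgenre'] = pd.Series(['fic', 'poe'], index=demo_meta.index)
demo_meta.loc[:, 'probability'] = pd.Series([0.93, 0.71], index=demo_meta.index)
print(demo_meta)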
        aliases[row['alias']] = row['ourname']

# find matches

key2docid = dict()
volbackup = dict()

with open('../metadata/filtered_fiction_plus_18c.tsv', encoding='utf-8') as f:
    reader = csv.DictReader(f, delimiter='\t')
    for row in reader:
        try:
            # Purely numeric docids are kept as-is; others get pairtree-cleaned.
            intval = int(row['docid'])
            docid = row['docid']
        except ValueError:
            docid = utils.clean_pairtree(row['docid'])

        possibleauth = row['author']
        possibletitle = normalize_title(row['title'])
        found = False

        for key, value in genremeta.items():
            author, title, normauth = key
            if match(normauth, title, possibleauth, possibletitle):
                key2docid[key] = docid
                volbackup[key] = utils.clean_pairtree(row['volid'])
                found = True
                print('Found: ', possibleauth, author, possibletitle)
                break

print('Found a total of ', len(key2docid))
list_of_dataframes = []
idset = set()

list_of_files = args[1:]
root = '../rawdata/'
list_of_paths = [root + x for x in list_of_files]

for p in list_of_paths:
    df = pd.read_csv(p, index_col='docid')
    list_of_dataframes.append(df)
    idset = idset | set(df.index)

ids = []
for anid in idset:
    ids.append(utils.clean_pairtree(str(anid)))

allpaths = set()
with open('/Volumes/TARDIS/work/ef/htrc-ef-all-files.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        allpaths.add(line)

missing = set()
found = set()
mapping = dict()
path2id = dict()

# things we already have:
        nonusa += 1

    if gender == 'f':
        female += 1
    elif gender == 'm':
        male += 1

bydate = dict()
fictionmetadata = dict()
datesbydocid = dict()

with open('/Users/tunder/work/genre/metadata/ficmeta.csv', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        docid = utils.clean_pairtree(row['htid'])
        fictionmetadata[docid] = row
        date = utils.date_row(row)
        datesbydocid[docid] = date

        if docid in selected:
            continue

        if date in bydate:
            bydate[date].append(docid)
        else:
            bydate[date] = [docid]

controlset = set()
controlmeta = dict()
usedfromselected = list()