# Generate Cotraining Set
#
# Uses a set of volumes already classified and sorted by a model in order to
# generate additional training data for a new model: for the 200 volumes at
# the end of the sorted co-training list, copy their page-feature files and
# rewrite their prediction files as genre maps.

import SonicScrewdriver as utils
from shutil import copyfile

indices, columns, metadata = utils.readtsv(
    "/Volumes/TARDIS/work/cotrain/sortedcotrain.tsv")

# The last 200 ids in the sorted list, normalized to pairtree filenames.
toget = [utils.pairtreefile(anid) for anid in indices[-200:]]

genredir = "/Volumes/TARDIS/work/cotrain/top200/genremaps/"
featuredir = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/"

for htid in toget:
    # Copy the page-feature file straight across to the top200 folder.
    copyfile("/Volumes/TARDIS/work/cotrain/pagefeatures/" + htid + ".pg.tsv",
             featuredir + htid + ".pg.tsv")

    # Read the model's predictions for this volume and start rewriting them
    # as a genre map.
    genresource = "/Volumes/TARDIS/work/cotrain/predictions/" + htid + ".predict"
    genredestination = genredir + htid + ".map"
    with open(genresource, mode="r", encoding="utf-8") as f:
        filelines = f.readlines()
    with open(genredestination, mode="w", encoding="utf-8") as f:
        for line in filelines:
            line = line.rstrip()
            # NOTE(review): the source chunk is truncated here — the loop body
            # that actually writes each stripped line into the .map file is
            # not visible; confirm against the original script.
# Report the overlap between locally held page-feature files and the volume
# ids listed in the slice files on the external drive.

import os
import SonicScrewdriver as utils

folder = "/Users/tunder/Dropbox/pagedata/thirdfeatures/pagefeatures/"

# Feature filenames with their trailing 7 characters removed (presumably the
# ".pg.tsv" suffix — TODO confirm), skipping hidden files.
validfiles = {
    name[:-7]
    for name in os.listdir(folder)
    if not name.startswith(".") and len(name) > 7
}

otherfolder = "/Volumes/TARDIS/output/slices/"
slicefiles = set()
for aslice in os.listdir(otherfolder):
    if aslice.startswith("."):
        # Skip hidden entries such as .DS_Store.
        continue
    with open(otherfolder + aslice, encoding="utf-8") as f:
        for rawline in f:
            # One volume id per line; normalize to pairtree filename form.
            slicefiles.add(utils.pairtreefile(rawline.rstrip()))

print(slicefiles.intersection(validfiles))
# Build an .arff-style listing of the volumes queued for active learning and
# stage each volume's normalized text alongside it.

import sys, os
import SonicScrewdriver as utils
from shutil import copyfile

with open("/Users/tunder/Dropbox/pagedata/activelearn/sourcefile.txt",
          mode="r", encoding="utf-8") as f:
    # One volume id per line.
    files = [line.rstrip() for line in f.readlines()]

with open("/Users/tunder/Dropbox/pagedata/activelearn/learn1.arff",
          mode="w", encoding="utf-8") as f:
    # Header block; as it says itself, this does not really use arff format —
    # it is just a list of files in the associated folder.
    f.write(
        "% List of files in associated folder.\n"
        "% Does not really use arff format.\n"
        "\n"
        "@RELATION learn1\n\n"
        "@ATTRIBUTE htid string\n"
        "@ATTRIBUTE endpg numeric\n"
        "@ATTRIBUTE startpgpart numeric\n"
        "@ATTRIBUTE endpgpart numeric\n"
        "@ATTRIBUTE probability numeric\n"
        "\n"
    )
    for afile in files:
        cleanid = utils.pairtreefile(afile)
        # One row per volume: htid plus placeholder numeric fields.
        f.write(cleanid + ",0,0,0,0,0\n")
        # Copy the normalized text into the activelearn folder.
        copyfile("/Volumes/TARDIS/work/cotrain/texts/" + cleanid + ".norm.txt",
                 "/Users/tunder/Dropbox/pagedata/activelearn/" + cleanid + ".txt")
# NOTE(review): this chunk was flattened onto one line and begins mid-script —
# the leading `if os.path.isfile(filepath): ...` is the tail of an earlier
# loop iteration whose header (and the definitions of filepath, idcode, date,
# author, title, outtable, counter, count_words) are not visible here, so the
# code is kept byte-identical rather than reformatted.
# Visible behavior: reads topicmodelingsample.tsv, then for each row builds a
# ".fic.txt" path under /Volumes/TARDIS/work/moneytexts/, counts tokens and
# words via count_words (defined elsewhere), pulls HTid/date/author/title from
# the metadata table, appends [idcode, date, tokencount, wordcount, author,
# title] to outtable, and exits via sys.exit(0) if any file is missing.
if os.path.isfile(filepath): tokencount, wordcount = count_words(filepath) else: print("Missing file: " + filepath) sys.exit(0) newrow = [idcode, date, tokencount, wordcount, author, title] outtable.append(newrow) print(counter) counter += 1 rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/topicmodelingsample.tsv') sourcedir = "/Volumes/TARDIS/work/moneytexts/" for row in rows: filename = utils.pairtreefile(row) + ".fic.txt" filepath = os.path.join(sourcedir, filename) if os.path.isfile(filepath): tokencount, wordcount = count_words(filepath) else: print("Missing file: " + filepath) sys.exit(0) idcode = table["HTid"][row] date = str(utils.simple_date(row, table)) author = table["author"][row] title = table["title"][row] newrow = [idcode, date, tokencount, wordcount, author, title] outtable.append(newrow) print(counter) counter += 1
# NOTE(review): this chunk was flattened onto one line and is clipped at both
# ends — it opens inside the body of a make_row-style helper (its `def`,
# along with htid/dirtyhtid/columns_to_exclude, is not visible) and closes
# inside a per-genre loop whose body continues past this view — so the code
# is kept byte-identical rather than reformatted.
# Visible behavior: the helper tail builds `outrow` from every metadata
# column after the first, skipping columns_to_exclude; the script then reads
# MergedMonographs.tsv, maps each cleaned (pairtree) id to a metadata row via
# make_row in `indextorows`, and begins iterating genrenames to collect
# per-genre volumes (per its own comment, some metadata volumes were not in
# the 95-percent subset and should not be carried forward).
# I'm not repeating these columns, because the first is not useful and the second # is not reliable. outrow = [htid] for column in columns[1:]: if column not in columns_to_exclude: outrow.append(table[column][dirtyhtid]) return outrow metadata_path = '/Volumes/TARDIS/work/metadata/MergedMonographs.tsv' rows, columns, table = utils.readtsv(metadata_path) indextorows = dict() for row in rows: cleanid = utils.pairtreefile(row) newrow = make_row(cleanid, row, columns, table) indextorows[cleanid] = newrow for genreabbrev, genre in genrenames.items(): print(genre) genrepath = os.path.join(rootpath, genre) volsinsubset = list() # Because there are some volumes in the metadata that weren't # included in the 95-percent subset. Those won't be present # as files, and shouldn't be carried forward to the next stage. metadataforgenre = dict()
# NOTE(review): near-duplicate of the word-count tabulation chunk that also
# appears earlier in this file, flattened onto one line and beginning
# mid-iteration — the leading `tokencount, wordcount = count_words(filepath)`
# belongs to an `if` whose header (and the definitions of filepath, idcode,
# date, author, title, outtable, counter, count_words) are not visible here,
# so the code is kept byte-identical rather than reformatted.
# Visible behavior: reads topicmodelingsample.tsv, and for each row counts
# tokens/words in the corresponding ".fic.txt" file, collects
# [idcode, date, tokencount, wordcount, author, title] into outtable, and
# exits via sys.exit(0) on any missing file.
tokencount, wordcount = count_words(filepath) else: print("Missing file: " + filepath) sys.exit(0) newrow = [idcode, date, tokencount, wordcount, author, title] outtable.append(newrow) print(counter) counter += 1 rows, columns, table = utils.readtsv( '/Users/tunder/Dropbox/GenreProject/metadata/topicmodelingsample.tsv') sourcedir = "/Volumes/TARDIS/work/moneytexts/" for row in rows: filename = utils.pairtreefile(row) + ".fic.txt" filepath = os.path.join(sourcedir, filename) if os.path.isfile(filepath): tokencount, wordcount = count_words(filepath) else: print("Missing file: " + filepath) sys.exit(0) idcode = table["HTid"][row] date = str(utils.simple_date(row, table)) author = table["author"][row] title = table["title"][row] newrow = [idcode, date, tokencount, wordcount, author, title] outtable.append(newrow) print(counter) counter += 1
# NOTE(review): duplicate of the "Generate Cotraining Set" script that also
# appears earlier in this file, flattened onto one line and truncated
# mid-loop — the final `for line in filelines:` has no visible body at all —
# so the code is kept byte-identical rather than reformatted.
# Visible behavior: for the last 200 ids in sortedcotrain.tsv (normalized to
# pairtree filenames), copies each ".pg.tsv" page-feature file into the
# top200 folder and begins rewriting each ".predict" prediction file as a
# ".map" genre map.
# Generate Cotraining Set # This script uses a set of volumes already classified and sorted by a model # in order to generate additional training data for a new model. import SonicScrewdriver as utils from shutil import copyfile indices, columns, metadata = utils.readtsv( "/Volumes/TARDIS/work/cotrain/sortedcotrain.tsv") toget = indices[-200:] toget = [utils.pairtreefile(x) for x in toget] genredir = "/Volumes/TARDIS/work/cotrain/top200/genremaps/" featuredir = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/" for htid in toget: featuresource = "/Volumes/TARDIS/work/cotrain/pagefeatures/" + htid + ".pg.tsv" featuredestination = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/" + htid + ".pg.tsv" copyfile(featuresource, featuredestination) genresource = "/Volumes/TARDIS/work/cotrain/predictions/" + htid + ".predict" genredestination = "/Volumes/TARDIS/work/cotrain/top200/genremaps/" + htid + ".map" with open(genresource, mode="r", encoding="utf-8") as f: filelines = f.readlines() with open(genredestination, mode="w", encoding="utf-8") as f: for line in filelines: