# Generate Cotraining Set

# This script uses a set of volumes already classified and sorted by a model
# in order to generate additional training data for a new model.

import SonicScrewdriver as utils
from shutil import copyfile

indices, columns, metadata = utils.readtsv("/Volumes/TARDIS/work/cotrain/sortedcotrain.tsv")

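# Take the 200 volumes at the end of the sorted list; judging by the
# "top200" destination folders, these are the volumes the model ranked highest.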
toget = indices[-200:]

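# Convert the raw volume IDs into their filename-safe (pairtree-cleaned) form.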
toget = [utils.pairtreefile(x) for x in toget]

genredir = "/Volumes/TARDIS/work/cotrain/top200/genremaps/"
featuredir = "/Volumes/TARDIS/work/cotrain/top200/pagefeatures/"

for htid in toget:

	featuresource = "/Volumes/TARDIS/work/cotrain/pagefeatures/" + htid + ".pg.tsv"
	featuredestination = featuredir + htid + ".pg.tsv"
	copyfile(featuresource, featuredestination)

	genresource = "/Volumes/TARDIS/work/cotrain/predictions/" + htid + ".predict"
	genredestination = genredir + htid + ".map"
	with open(genresource, mode="r", encoding="utf-8") as f:
		filelines = f.readlines()

	with open(genredestination, mode="w", encoding="utf-8") as f:
		for line in filelines:
			line = line.rstrip()
			# The loop body is truncated in the source; writing each stripped
			# prediction line back out is a minimal stand-in.
			f.write(line + "\n")
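
# Check Feature/Slice Overlap

# This script compares the page-feature files in a local folder against the
# volume IDs listed in slice files, and prints the IDs that appear in both.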
import os
import SonicScrewdriver as utils

folder = "/Users/tunder/Dropbox/pagedata/thirdfeatures/pagefeatures/"
files = os.listdir(folder)

validfiles = set()
for filename in files:
    if not filename.startswith(".") and len(filename) > 7:
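        # Strip the ".pg.tsv" suffix (seven characters) to get the bare volume ID.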
        filename = filename[:-7]
        validfiles.add(filename)

otherfolder = "/Volumes/TARDIS/output/slices/"

slices = os.listdir(otherfolder)
slicefiles = set()

for aslice in slices:
    if aslice.startswith("."):
        continue
    with open(otherfolder + aslice, encoding="utf-8") as f:
        fl = f.readlines()
    for line in fl:
        line = line.rstrip()
        line = utils.pairtreefile(line)
        slicefiles.add(line)

print(slicefiles.intersection(validfiles))
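
# Build Active-Learning File List

# This script reads a list of volume IDs, writes a placeholder .arff-style
# index for them, and copies each volume's normalized text into the
# active-learning folder.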
import sys, os
import SonicScrewdriver as utils
from shutil import copyfile

with open("/Users/tunder/Dropbox/pagedata/activelearn/sourcefile.txt", mode="r", encoding="utf-8") as f:
	filelines = f.readlines()

files = list()
for line in filelines:
	files.append(line.rstrip())

with open("/Users/tunder/Dropbox/pagedata/activelearn/learn1.arff", mode="w", encoding="utf-8") as f:
	f.write("% List of files in associated folder.\n")
	f.write("% Does not really use arff format.\n")
	f.write("\n")
	f.write("@RELATION learn1\n\n")
	f.write("@ATTRIBUTE htid string\n")
	f.write("@ATTRIBUTE endpg numeric\n")
	f.write("@ATTRIBUTE startpgpart numeric\n")
	f.write("@ATTRIBUTE endpgpart numeric\n")
	f.write("@ATTRIBUTE probability numeric\n")
	f.write("\n")

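	# Write a placeholder row of zeroes for each volume.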
	for afile in files:
		outline = utils.pairtreefile(afile) + ",0,0,0,0,0\n"
		f.write(outline)
		sourcepath = "/Volumes/TARDIS/work/cotrain/texts/" + utils.pairtreefile(afile) + ".norm.txt"
		destination = "/Users/tunder/Dropbox/pagedata/activelearn/" + utils.pairtreefile(afile) + ".txt"
		copyfile(sourcepath, destination)

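# Count Words in a Metadata Sample

# This script reads a metadata table, counts tokens and words in each listed
# volume's text file, and appends the results to an output table.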
import sys, os
import SonicScrewdriver as utils

# Stand-in for the original count_words helper, which is not included in this
# snippet; it is assumed to return (tokencount, wordcount) for a text file.
def count_words(filepath):
    with open(filepath, encoding="utf-8") as f:
        tokens = f.read().split()
    wordcount = sum(1 for token in tokens if token.isalpha())
    return len(tokens), wordcount

outtable = list()
counter = 0

rows, columns, table = utils.readtsv('/Users/tunder/Dropbox/GenreProject/metadata/topicmodelingsample.tsv')

sourcedir = "/Volumes/TARDIS/work/moneytexts/"

for row in rows:
    filename = utils.pairtreefile(row) + ".fic.txt"
    filepath = os.path.join(sourcedir, filename)
    if os.path.isfile(filepath):
        tokencount, wordcount = count_words(filepath)
    else:
        print("Missing file: " + filepath)
        sys.exit(0)

    idcode = table["HTid"][row]
    date = str(utils.simple_date(row, table))
    author = table["author"][row]
    title = table["title"][row]
    newrow = [idcode, date, tokencount, wordcount, author, title]
    outtable.append(newrow)
    print(counter)
    counter += 1
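
# Build Per-Genre Metadata

# This script rebuilds a cleaned metadata row for each volume and then
# groups volumes by genre.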
import os
import SonicScrewdriver as utils

# Assumed to be defined earlier in the full script: columns_to_exclude,
# genrenames, and rootpath.

def make_row(htid, dirtyhtid, columns, table):
    # I'm not repeating these columns, because the first is not useful and the
    # second is not reliable.

    outrow = [htid]
    for column in columns[1:]:
        if column not in columns_to_exclude:
            outrow.append(table[column][dirtyhtid])

    return outrow

metadata_path = '/Volumes/TARDIS/work/metadata/MergedMonographs.tsv'
rows, columns, table = utils.readtsv(metadata_path)

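# Map each volume's cleaned (filename-safe) ID to its rebuilt metadata row.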
indextorows = dict()
for row in rows:
    cleanid = utils.pairtreefile(row)
    newrow = make_row(cleanid, row, columns, table)
    indextorows[cleanid] = newrow

for genreabbrev, genre in genrenames.items():

    print(genre)

    genrepath = os.path.join(rootpath, genre)

    volsinsubset = list()
    # Because there are some volumes in the metadata that weren't
    # included in the 95-percent subset. Those won't be present
    # as files, and shouldn't be carried forward to the next stage.
    metadataforgenre = dict()