def get_donttrainset(all_positives, positive_tags, metadict, donttrainconditions, datetype):
    ''' Identify positive volumes that must be kept out of the training set
    because they belong to a category that is being tested only.

    A volume from all_positives is held out when either

      * the date inferred for it by metautils.infer_date (using datetype)
        lies outside the (past, future) thresholds returned by
        get_thresholds(donttrainconditions), or
      * its 'tagset' contains at least one tag listed in donttrainconditions
        and contains no positive tag other than those listed there.

    A condition of the form "limit==250" caps the size of the result: if
    more volumes than that were selected, a random sample of that size is
    returned instead.

    Returns a set of volume ids.
    '''

    donttrainset = set()

    pastthreshold, futurethreshold = get_thresholds(donttrainconditions)

    for posvol in all_positives:
        date = metautils.infer_date(metadict[posvol], datetype)
        if date < pastthreshold or date > futurethreshold:
            # Out-of-range dates are held out regardless of tags.
            donttrainset.add(posvol)
            continue

        tagset = metadict[posvol]['tagset']

        # Does the volume carry a positive tag that is NOT test-only?
        hasotherpositive = any(
            tag in tagset and tag not in donttrainconditions
            for tag in positive_tags
        )
        # Does the volume carry any test-only tag?
        hasexclusion = any(tag in tagset for tag in donttrainconditions)

        if hasexclusion and not hasotherpositive:
            donttrainset.add(posvol)

    # The following paragraph allows us to limit the size of the
    # donttrainset by including a tag like "limit==250"

    for tag in donttrainconditions:
        if 'limit==' in tag:
            limit = int(tag.replace('limit==', ''))
            if limit < len(donttrainset):
                # random.sample() no longer accepts a set (TypeError since
                # Python 3.11), so sample from a list. Sorting first gives
                # the population a stable order, which keeps the sample
                # reproducible when the random module has been seeded.
                population = sorted(donttrainset, key=str)
                donttrainset = set(random.sample(population, limit))

    return donttrainset
def create_model(paths, exclusions, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.

    Arguments (each is unpacked positionally):
      paths -- (sourcefolder, extension, metadatapath, outputpath, vocabpath)
      exclusions -- (excludeif, excludeifnot, excludebelow, excludeabove,
          sizecap)
      classifyconditions -- (positive_tags, negative_tags, datetype,
          numfeatures, regularization, testconditions)

    What it does, in order:
      1. lists the files in sourcefolder that end with extension and loads
         metadata for them through metafilter;
      2. labels the volumes and finds the ones never to be trained on;
      3. builds one feature row per volume from its tab-separated
         word/count file;
      4. predicts every volume with modelingprocess.model_one_volume in a
         multiprocessing Pool, holding out same-author volumes;
      5. writes one row per volume to outputpath and tallies the
         confusion matrix;
      6. fits a single LogisticRegression on everything outside the
         donttrainset and writes its coefficients next to outputpath.

    Side effects: prints progress to stdout, adds a 'trainsize' entry to each
    used record in metadict, and writes two CSV files.

    Returns (accuracy, allvolumes, coefficientuples), where allvolumes is the
    list of rows written to outputpath and coefficientuples is a sorted list
    of (coefficient, normalized coefficient, word) tuples.

    NOTE(review): in the portion of the file reviewed, a second definition of
    create_model with the same logic appears later; being later, it is the
    one bound at import time. Consider removing one of the two.
    '''

    sourcefolder, extension, metadatapath, outputpath, vocabpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    (positive_tags, negative_tags, datetype, numfeatures, regularization,
     testconditions) = classifyconditions

    verbose = False
    holdout_authors = True

    # If you want reliable results, always run this with holdout_authors
    # set to True. The only reason to set it to False is to confirm that
    # this flag is actually making a difference. If you do that, it
    # disables the code that keeps other works by the author being predicted
    # out of the training set.

    # The following function confirms that the testconditions are legal.

    confirm_testconditions(testconditions, positive_tags)

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. Only the
            # trailing occurrence is removed, so an extension-like string
            # elsewhere in the filename cannot corrupt the ID.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(metadatapath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow, excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast.

    if type(positive_tags[0]).__name__ == 'int':
        categorytodivide = 'firstpub'
    else:
        categorytodivide = 'tagset'

    IDsToUse, classdictionary, donttrainset = metafilter.label_classes(
        metadict, categorytodivide, positive_tags, negative_tags, sizecap,
        datetype, excludeif, testconditions)

    print()
    # Named earliest/latest rather than min/max so the builtins stay usable.
    earliest, latest = first_and_last(IDsToUse, metadict, datetype)
    if earliest > 0:
        print("The whole corpus involved here includes " + str(len(IDsToUse)))
        print("volumes, ranging in date from " + str(earliest) + " to " + str(latest) + ".")
        print()

    # We now create an ordered list of id-path tuples for later use.
    # NOTE(review): the code below assumes volspresent and orderedIDs list
    # the same volumes in the same order (rows of the data matrix are built
    # from volspresent but addressed by orderedIDs index) -- confirm against
    # get_volume_lists.

    volspresent, orderedIDs = get_volume_lists(volumeIDs, volumepaths, IDsToUse)

    describe_donttrainset(donttrainset, classdictionary, metadict, datetype)

    # Create a flag for each volume that indicates whether it was used in
    # training.

    record_trainflags(metadict, donttrainset)

    # Get a count of docfrequency for all words in the corpus. This is
    # probably not needed and might be deprecated later.

    wordcounts = get_docfrequency(volspresent, donttrainset)

    # The feature list is the same for all models. useall=True is kept from
    # the original; per the original author's note, setting it to False
    # makes the vocabulary exclude very rare words, which may help when
    # troubleshooting div-by-zero errors in training.

    vocablist = get_vocablist(vocabpath, volspresent, wordcounts,
                              useall=True, n=numfeatures)

    # For each volume, we're going to create a list of volumes that should be
    # excluded from the training set when it is to be predicted. More
    # precisely, we're going to create a list of their *indexes*, so that we
    # can easily remove rows from the training matrix.

    # This list will include for ALL volumes, the indexes of vols in the
    # donttrainset.

    donttrainon = [orderedIDs.index(x) for x in donttrainset]

    authormatches = [list(donttrainon) for _ in orderedIDs]

    # Indexes already present in every exclusion list; checking against this
    # set avoids scanning a list inside the quadratic loop below.
    alwaysexcluded = set(donttrainon)

    # Now we proceed to enlarge that list by identifying, for each volume,
    # a set of indexes that have the same author. We exclude a vol from
    # its own training set.

    if holdout_authors:
        authors = [metadict[anid]['author'] for anid in orderedIDs]
        for idx1, thisauthor in enumerate(authors):
            matches = authormatches[idx1]
            for idx2, otherauthor in enumerate(authors):
                if idx2 not in alwaysexcluded and thisauthor == otherauthor:
                    matches.append(idx2)
    else:
        # This code only runs if we're testing the effect of
        # holdout_authors by disabling it.

        for idx1 in range(len(orderedIDs)):
            if idx1 not in alwaysexcluded:
                authormatches[idx1].append(idx1)

    # Report how many positive and negative volumes are available for
    # training at all.

    trainingpositives = set()
    trainingnegatives = set()

    for anid, thisclass in classdictionary.items():
        if anid in donttrainset:
            continue

        if thisclass == 1:
            trainingpositives.add(orderedIDs.index(anid))
        else:
            trainingnegatives.add(orderedIDs.index(anid))

    print('Training positives: ' + str(len(trainingpositives)))
    print('Training negatives: ' + str(len(trainingnegatives)))

    # An earlier version tried to rebalance positives and negatives after
    # same-author exclusions. It was disabled because it could have grossly
    # unintended effects when there were many donttrainon exclusions.

    # Let's record, for each volume, the size of its training set.

    trainingsizes = []

    numvolumes = len(orderedIDs)
    for anid, excludedidx in zip(orderedIDs, authormatches):
        trainsize = numvolumes - len(excludedidx)
        metadict[anid]['trainsize'] = trainsize
        trainingsizes.append(trainsize)

    # Guarded so an empty corpus does not die here with ZeroDivisionError.
    if trainingsizes:
        averagetrainingsize = sum(trainingsizes) / len(trainingsizes)
    else:
        averagetrainingsize = 0

    for alist in authormatches:
        alist.sort(reverse=True)

    # I am reversing the order of indexes so that I can delete them from
    # back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:

        with open(volpath, encoding='utf-8') as f:
            voldict = dict()
            totalcount = 0
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    # Skip anything that is not exactly word<TAB>count.
                    continue

                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        date = metautils.infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date, totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            if totalcount == 0:
                # Avoid dividing by zero for an empty volume.
                totalcount = .00001
            voldata.append(features / totalcount)

        volsizes[volid] = totalcount
        classvector.append(classdictionary[volid])

    data = pd.DataFrame(voldata)

    # One work item per volume: the shared data plus the row indexes to
    # leave out when that volume is predicted.
    sextuplets = [
        (data, classvector, listtoexclude, i, usedate, regularization)
        for i, listtoexclude in enumerate(authormatches)
    ]

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    pool = Pool(processes=11)
    try:
        res = pool.map_async(modelingprocess.model_one_volume, sextuplets)
        res.wait()
        resultlist = res.get()
    finally:
        # Always release the worker processes, even if a worker raised.
        pool.close()
        pool.join()

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict(zip(orderedIDs, resultlist))

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    # newline='' is what the csv module requires of files it writes to;
    # without it, rows are separated by blank lines on Windows.
    with open(outputpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        header = ['volid', 'dateused', 'pubdate', 'birthdate', 'firstpub',
                  'gender', 'nation', 'allwords', 'logistic', 'realclass',
                  'trainflag', 'trainsize', 'author', 'title', 'genretags']
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            outrow = [
                volid,
                metadata[datetype],
                metadata['pubdate'],
                metadata['birthdate'],
                metadata['firstpub'],
                metadata['gender'],
                metadata['nation'],
                volsizes[volid],
                logistic,
                realclass,
                metadata['trainflag'],
                metadata['trainsize'],
                metadata['author'],
                metadata['title'],
                ' | '.join(metadata['tagset']),
            ]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            if logistic == 0.5:
                print("equals!")
                predictedpositive = random.sample([True, False], 1)[0]
            elif logistic > 0.5:
                predictedpositive = True
            elif logistic < 0.5:
                predictedpositive = False
            else:
                # Reached only when no comparison holds (e.g. NaN).
                print('Oh, joy. A fundamental floating point error.')
                predictedpositive = random.sample([True, False], 1)[0]

            if predictedpositive and realclass > 0.5:
                truepositives += 1
            elif not predictedpositive and realclass < 0.5:
                truenegatives += 1
            elif not predictedpositive and realclass > 0.5:
                falsenegatives += 1
            elif predictedpositive and realclass < 0.5:
                falsepositives += 1
            else:
                print("Wait a second, boss.")

    # Fit one model on everything outside the donttrainset, purely to
    # report coefficients.

    donttrainon.sort(reverse=True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    trainingset, testset = modelingprocess.remove_zerocols(trainingset, testset)
    newmodel = LogisticRegression(C=regularization)
    trainingset, _means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    # NOTE(review): zip() pairs coefficients with vocablist positionally and
    # stops at the shortest input; if remove_zerocols dropped any columns the
    # words may no longer line up with their coefficients -- confirm.
    coefficientuples = list(zip(coefficients,
                                (coefficients / np.array(stdevs)),
                                vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " : " + str(coefficient))

    print()
    totalevaluated = truepositives + truenegatives + falsepositives + falsenegatives
    if totalevaluated != len(IDsToUse):
        print("Total evaluated = " + str(totalevaluated))
        print("But we've got " + str(len(IDsToUse)))

    # Each ratio falls back to 0.0 when its denominator is zero (for example
    # when no volume was predicted positive) instead of raising
    # ZeroDivisionError after the predictions have already been written.
    accuracy = (truepositives + truenegatives) / totalevaluated if totalevaluated else 0.0

    print('True positives ' + str(truepositives))
    print('True negatives ' + str(truenegatives))
    print('False positives ' + str(falsepositives))
    print('False negatives ' + str(falsenegatives))

    print()
    print('The average size of the training set was ' + str(averagetrainingsize))
    print()

    predictedpositives = truepositives + falsepositives
    actualpositives = truepositives + falsenegatives
    precision = truepositives / predictedpositives if predictedpositives else 0.0
    recall = truepositives / actualpositives if actualpositives else 0.0
    if precision + recall > 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0.0
    print("F1 : " + str(F1))

    coefficientpath = outputpath.replace('.csv', '.coefs.csv')
    if coefficientpath == outputpath:
        # outputpath has no '.csv' in it; without this the coefficients
        # would overwrite the predictions file just written.
        coefficientpath = outputpath + '.coefs.csv'

    with open(coefficientpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for coef, normalizedcoef, word in coefficientuples:
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples
def create_model(paths, exclusions, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.

    Arguments (each is unpacked positionally):
      paths -- (sourcefolder, extension, metadatapath, outputpath, vocabpath)
      exclusions -- (excludeif, excludeifnot, excludebelow, excludeabove,
          sizecap)
      classifyconditions -- (positive_tags, negative_tags, datetype,
          numfeatures, regularization, testconditions)

    What it does, in order:
      1. lists the files in sourcefolder that end with extension and loads
         metadata for them through metafilter;
      2. labels the volumes and finds the ones never to be trained on;
      3. builds one feature row per volume from its tab-separated
         word/count file;
      4. predicts every volume with modelingprocess.model_one_volume in a
         multiprocessing Pool, holding out same-author volumes;
      5. writes one row per volume to outputpath and tallies the
         confusion matrix;
      6. fits a single LogisticRegression on everything outside the
         donttrainset and writes its coefficients next to outputpath.

    Side effects: prints progress to stdout, adds a 'trainsize' entry to each
    used record in metadict, and writes two CSV files.

    Returns (accuracy, allvolumes, coefficientuples), where allvolumes is the
    list of rows written to outputpath and coefficientuples is a sorted list
    of (coefficient, normalized coefficient, word) tuples.

    NOTE(review): in the portion of the file reviewed, an earlier definition
    of create_model with the same logic appears above; this later one is the
    definition bound at import time. Consider removing one of the two.
    '''

    sourcefolder, extension, metadatapath, outputpath, vocabpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    (positive_tags, negative_tags, datetype, numfeatures, regularization,
     testconditions) = classifyconditions

    verbose = False
    holdout_authors = True

    # If you want reliable results, always run this with holdout_authors
    # set to True. The only reason to set it to False is to confirm that
    # this flag is actually making a difference. If you do that, it
    # disables the code that keeps other works by the author being predicted
    # out of the training set.

    # The following function confirms that the testconditions are legal.

    confirm_testconditions(testconditions, positive_tags)

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:
        if filename.endswith(extension):
            # The volume ID is the filename minus its extension. Only the
            # trailing occurrence is removed, so an extension-like string
            # elsewhere in the filename cannot corrupt the ID.
            volID = filename[:len(filename) - len(extension)]
            volumeIDs.append(volID)
            volumepaths.append(sourcefolder + filename)

    metadict = metafilter.get_metadata(metadatapath, volumeIDs, excludeif,
                                       excludeifnot, excludebelow, excludeabove)

    # Now that we have a list of volumes with metadata, we can select the
    # groups of IDs that we actually intend to contrast.

    if type(positive_tags[0]).__name__ == 'int':
        categorytodivide = 'firstpub'
    else:
        categorytodivide = 'tagset'

    IDsToUse, classdictionary, donttrainset = metafilter.label_classes(
        metadict, categorytodivide, positive_tags, negative_tags, sizecap,
        datetype, excludeif, testconditions)

    print()
    # Named earliest/latest rather than min/max so the builtins stay usable.
    earliest, latest = first_and_last(IDsToUse, metadict, datetype)
    if earliest > 0:
        print("The whole corpus involved here includes " + str(len(IDsToUse)))
        print("volumes, ranging in date from " + str(earliest) + " to " + str(latest) + ".")
        print()

    # We now create an ordered list of id-path tuples for later use.
    # NOTE(review): the code below assumes volspresent and orderedIDs list
    # the same volumes in the same order (rows of the data matrix are built
    # from volspresent but addressed by orderedIDs index) -- confirm against
    # get_volume_lists.

    volspresent, orderedIDs = get_volume_lists(volumeIDs, volumepaths, IDsToUse)

    describe_donttrainset(donttrainset, classdictionary, metadict, datetype)

    # Create a flag for each volume that indicates whether it was used in
    # training.

    record_trainflags(metadict, donttrainset)

    # Get a count of docfrequency for all words in the corpus. This is
    # probably not needed and might be deprecated later.

    wordcounts = get_docfrequency(volspresent, donttrainset)

    # The feature list is the same for all models. useall=True is kept from
    # the original; per the original author's note, setting it to False
    # makes the vocabulary exclude very rare words, which may help when
    # troubleshooting div-by-zero errors in training.

    vocablist = get_vocablist(vocabpath, volspresent, wordcounts,
                              useall=True, n=numfeatures)

    # For each volume, we're going to create a list of volumes that should be
    # excluded from the training set when it is to be predicted. More
    # precisely, we're going to create a list of their *indexes*, so that we
    # can easily remove rows from the training matrix.

    # This list will include for ALL volumes, the indexes of vols in the
    # donttrainset.

    donttrainon = [orderedIDs.index(x) for x in donttrainset]

    authormatches = [list(donttrainon) for _ in orderedIDs]

    # Indexes already present in every exclusion list; checking against this
    # set avoids scanning a list inside the quadratic loop below.
    alwaysexcluded = set(donttrainon)

    # Now we proceed to enlarge that list by identifying, for each volume,
    # a set of indexes that have the same author. We exclude a vol from
    # its own training set.

    if holdout_authors:
        authors = [metadict[anid]['author'] for anid in orderedIDs]
        for idx1, thisauthor in enumerate(authors):
            matches = authormatches[idx1]
            for idx2, otherauthor in enumerate(authors):
                if idx2 not in alwaysexcluded and thisauthor == otherauthor:
                    matches.append(idx2)
    else:
        # This code only runs if we're testing the effect of
        # holdout_authors by disabling it.

        for idx1 in range(len(orderedIDs)):
            if idx1 not in alwaysexcluded:
                authormatches[idx1].append(idx1)

    # Report how many positive and negative volumes are available for
    # training at all.

    trainingpositives = set()
    trainingnegatives = set()

    for anid, thisclass in classdictionary.items():
        if anid in donttrainset:
            continue

        if thisclass == 1:
            trainingpositives.add(orderedIDs.index(anid))
        else:
            trainingnegatives.add(orderedIDs.index(anid))

    print('Training positives: ' + str(len(trainingpositives)))
    print('Training negatives: ' + str(len(trainingnegatives)))

    # An earlier version tried to rebalance positives and negatives after
    # same-author exclusions. It was disabled because it could have grossly
    # unintended effects when there were many donttrainon exclusions.

    # Let's record, for each volume, the size of its training set.

    trainingsizes = []

    numvolumes = len(orderedIDs)
    for anid, excludedidx in zip(orderedIDs, authormatches):
        trainsize = numvolumes - len(excludedidx)
        metadict[anid]['trainsize'] = trainsize
        trainingsizes.append(trainsize)

    # Guarded so an empty corpus does not die here with ZeroDivisionError.
    if trainingsizes:
        averagetrainingsize = sum(trainingsizes) / len(trainingsizes)
    else:
        averagetrainingsize = 0

    for alist in authormatches:
        alist.sort(reverse=True)

    # I am reversing the order of indexes so that I can delete them from
    # back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.

    volsizes = dict()
    voldata = list()
    classvector = list()

    for volid, volpath in volspresent:

        with open(volpath, encoding='utf-8') as f:
            voldict = dict()
            totalcount = 0
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) != 2:
                    # Skip anything that is not exactly word<TAB>count.
                    continue

                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        date = metautils.infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        if usedate:
            features = get_features_with_date(voldict, vocablist, date, totalcount)
            voldata.append(features)
        else:
            features = get_features(voldict, vocablist)
            if totalcount == 0:
                # Avoid dividing by zero for an empty volume.
                totalcount = .00001
            voldata.append(features / totalcount)

        volsizes[volid] = totalcount
        classvector.append(classdictionary[volid])

    data = pd.DataFrame(voldata)

    # One work item per volume: the shared data plus the row indexes to
    # leave out when that volume is predicted.
    sextuplets = [
        (data, classvector, listtoexclude, i, usedate, regularization)
        for i, listtoexclude in enumerate(authormatches)
    ]

    # Now do leave-one-out predictions.
    print('Beginning multiprocessing.')

    pool = Pool(processes=11)
    try:
        res = pool.map_async(modelingprocess.model_one_volume, sextuplets)
        res.wait()
        resultlist = res.get()
    finally:
        # Always release the worker processes, even if a worker raised.
        pool.close()
        pool.join()

    assert len(resultlist) == len(orderedIDs)

    logisticpredictions = dict(zip(orderedIDs, resultlist))

    print('Multiprocessing concluded.')

    truepositives = 0
    truenegatives = 0
    falsepositives = 0
    falsenegatives = 0
    allvolumes = list()

    # newline='' is what the csv module requires of files it writes to;
    # without it, rows are separated by blank lines on Windows.
    with open(outputpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        header = ['volid', 'dateused', 'pubdate', 'birthdate', 'firstpub',
                  'gender', 'nation', 'allwords', 'logistic', 'realclass',
                  'trainflag', 'trainsize', 'author', 'title', 'genretags']
        writer.writerow(header)
        for volid in IDsToUse:
            metadata = metadict[volid]
            logistic = logisticpredictions[volid]
            realclass = classdictionary[volid]
            outrow = [
                volid,
                metadata[datetype],
                metadata['pubdate'],
                metadata['birthdate'],
                metadata['firstpub'],
                metadata['gender'],
                metadata['nation'],
                volsizes[volid],
                logistic,
                realclass,
                metadata['trainflag'],
                metadata['trainsize'],
                metadata['author'],
                metadata['title'],
                ' | '.join(metadata['tagset']),
            ]
            writer.writerow(outrow)
            allvolumes.append(outrow)

            if logistic == 0.5:
                print("equals!")
                predictedpositive = random.sample([True, False], 1)[0]
            elif logistic > 0.5:
                predictedpositive = True
            elif logistic < 0.5:
                predictedpositive = False
            else:
                # Reached only when no comparison holds (e.g. NaN).
                print('Oh, joy. A fundamental floating point error.')
                predictedpositive = random.sample([True, False], 1)[0]

            if predictedpositive and realclass > 0.5:
                truepositives += 1
            elif not predictedpositive and realclass < 0.5:
                truenegatives += 1
            elif not predictedpositive and realclass > 0.5:
                falsenegatives += 1
            elif predictedpositive and realclass < 0.5:
                falsepositives += 1
            else:
                print("Wait a second, boss.")

    # Fit one model on everything outside the donttrainset, purely to
    # report coefficients.

    donttrainon.sort(reverse=True)
    trainingset, yvals, testset = sliceframe(data, classvector, donttrainon, 0)
    trainingset, testset = modelingprocess.remove_zerocols(trainingset, testset)
    newmodel = LogisticRegression(C=regularization)
    trainingset, _means, stdevs = normalizearray(trainingset, usedate)
    newmodel.fit(trainingset, yvals)

    coefficients = newmodel.coef_[0] * 100

    # NOTE(review): zip() pairs coefficients with vocablist positionally and
    # stops at the shortest input; if remove_zerocols dropped any columns the
    # words may no longer line up with their coefficients -- confirm.
    coefficientuples = list(zip(coefficients,
                                (coefficients / np.array(stdevs)),
                                vocablist + ['pub.date']))
    coefficientuples.sort()
    if verbose:
        for coefficient, normalizedcoef, word in coefficientuples:
            print(word + " : " + str(coefficient))

    print()
    totalevaluated = truepositives + truenegatives + falsepositives + falsenegatives
    if totalevaluated != len(IDsToUse):
        print("Total evaluated = " + str(totalevaluated))
        print("But we've got " + str(len(IDsToUse)))

    # Each ratio falls back to 0.0 when its denominator is zero (for example
    # when no volume was predicted positive) instead of raising
    # ZeroDivisionError after the predictions have already been written.
    accuracy = (truepositives + truenegatives) / totalevaluated if totalevaluated else 0.0

    print('True positives ' + str(truepositives))
    print('True negatives ' + str(truenegatives))
    print('False positives ' + str(falsepositives))
    print('False negatives ' + str(falsenegatives))

    print()
    print('The average size of the training set was ' + str(averagetrainingsize))
    print()

    predictedpositives = truepositives + falsepositives
    actualpositives = truepositives + falsenegatives
    precision = truepositives / predictedpositives if predictedpositives else 0.0
    recall = truepositives / actualpositives if actualpositives else 0.0
    if precision + recall > 0:
        F1 = 2 * (precision * recall) / (precision + recall)
    else:
        F1 = 0.0
    print("F1 : " + str(F1))

    coefficientpath = outputpath.replace('.csv', '.coefs.csv')
    if coefficientpath == outputpath:
        # outputpath has no '.csv' in it; without this the coefficients
        # would overwrite the predictions file just written.
        coefficientpath = outputpath + '.coefs.csv'

    with open(coefficientpath, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for coef, normalizedcoef, word in coefficientuples:
            writer.writerow([word, coef, normalizedcoef])

    return accuracy, allvolumes, coefficientuples