def optimize(corpusdir, outputdir):
    parameters = fit_parameters()
    acc, n, L = max(parameters, key=lambda r: r[0])
    logging.info("Chose parameters: n=%d, L=%d", n, L)
    logging.disable(logging.DEBUG)
    authors, scores = create_ranking(n, L)
    jsonhandler.storeJson(outputdir, jsonhandler.unknowns, authors, scores)
def main():
    parser = argparse.ArgumentParser(
        description="Tira submission for PPM approach (teahan03)")
    parser.add_argument("-i", action="store", help="path to corpus directory")
    parser.add_argument("-o", action="store", help="path to output directory")
    args = vars(parser.parse_args())

    corpusdir = args["i"]
    outputdir = args["o"]
    if corpusdir is None or outputdir is None:
        parser.print_help()
        return

    jsonhandler.loadJson(corpusdir)

    global modeldir
    modeldir = os.path.join(outputdir, MODEL_DIR)
    if not os.path.exists(modeldir):
        # os.makedirs(modeldir)
        createModels()
    else:
        loadModels()
    createAnswers()
    jsonhandler.storeJson(outputdir, unknowns, authors, scores)
def tira(corpusdir, outputdir):
    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    logger.info("Reading in stopwords...")
    with open("stopwords.txt") as stopword_file:
        stopwords = stopword_file.read().split()

    authors = jsonhandler.candidates
    tests = jsonhandler.unknowns
    raw = {}
    raw_test = {}
    C = {}
    C_test = {}
    Gauthors = {}
    Gtests = {}

    for author in authors:
        logger.info("Reading in texts of " + str(author) + "...")
        # raw[author] = open(author, encoding='iso-8859-1')
        # Keep only the longest training text per author.
        for training in jsonhandler.trainings[author]:
            newtext = jsonhandler.getTrainingText(author, training)
            if author in raw:
                if len(newtext) > len(raw[author]):
                    raw[author] = newtext
            else:
                raw[author] = newtext
        C[author] = prepareCorpus(raw[author])
        logger.info("Calculating stopword graph of " + str(author) + "...")
        Gauthors[author] = getStopWordGraph(C[author], stopwords)

    for author in tests:
        logger.info("Reading in test document " + str(author) + "...")
        raw_test[author] = jsonhandler.getUnknownText(author)
        C_test[author] = prepareCorpus(raw_test[author])
        logger.info("Calculating stopword graph of " + str(author) + "...")
        Gtests[author] = getStopWordGraph(C_test[author], stopwords)

    results = []
    for testcase in tests:
        print(testcase)
        KL = {}
        Gtst = deepcopy(Gtests[testcase])
        Gtst = normalizeWeights(Gtst)
        for author in authors:
            logger.info("Calculating KL divergence of " + str(author) + "...")
            KL[author] = authorKLdiv(Gauthors[author], Gtst)
        print(KL)
        # m = np.argmin(KL)
        m = min(KL, key=KL.get)
        results.append((testcase, m))

    texts = [text for (text, cand) in results]
    cands = [cand for (text, cand) in results]
    jsonhandler.storeJson(outputdir, texts, cands)
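# A minimal sketch of a Kullback-Leibler divergence between two normalized
# weight distributions, added for illustration only; the repository's own
# authorKLdiv operates on stopword graphs and may differ. Both arguments are
# assumed to be dicts mapping a feature (e.g. an edge) to a probability, and
# the epsilon smoothing is an assumption to avoid log(0).
import math

def kl_divergence_sketch(p, q, epsilon=1e-12):
    total = 0.0
    for key in set(p) | set(q):
        pk = p.get(key, 0.0) + epsilon
        qk = q.get(key, 0.0) + epsilon
        total += pk * math.log(pk / qk)
    return total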
def main():
    parser = argparse.ArgumentParser(description='keselj03')
    parser.add_argument('-i', action='store', help='Path to input directory')
    parser.add_argument('-o', action='store', help='Path to output directory')
    args = vars(parser.parse_args())
    corpusdir = args['i']
    outputdir = args['o']

    jsonhandler.loadJson(corpusdir)
    parameters = fit_parameters()
    acc, n, L = max(parameters, key=lambda r: r[0])
    logging.info("Chose parameters: n=%d, L=%d", n, L)
    # jsonhandler.loadTesting()
    authors, scores = create_ranking(n, L)
    jsonhandler.storeJson(outputdir, jsonhandler.unknowns, authors, scores)
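# A minimal sketch, for illustration only, of the profile dissimilarity from
# Keselj et al. (2003) that common-n-gram methods like the one above are
# typically built on; the repository's own create_ranking may differ. Both
# arguments are assumed to be dicts mapping an n-gram to its relative
# frequency in the respective profile.
def keselj_dissimilarity_sketch(author_profile, test_profile):
    total = 0.0
    for gram in set(author_profile) | set(test_profile):
        fa = author_profile.get(gram, 0.0)
        fb = test_profile.get(gram, 0.0)
        total += (2.0 * (fa - fb) / (fa + fb)) ** 2
    return total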
def main(corpusdir, outputdir, n_max_feature_number=1000, m_subspace_width=2,
         mySolver='svd', shrinkage=None):
    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()
    authors = jsonhandler.candidates
    tests = jsonhandler.unknowns
    encoding_setting = jsonhandler.encoding

    global trainSet, testSet
    texts = []
    text_authors = []
    test_texts = []
    ergebnisList = []
    prob_list = []

    for i in range(len(authors)):
        author = authors[i]
        for text in jsonhandler.trainings[author]:
            texts.append(jsonhandler.getTrainingText(author, text))
            text_authors.append(i)

    trainSet = getBunchOutRawTrain(texts, text_authors, authors)
    classifier_set = exhaustiv_disjoint_subspacing(trainSet, n_max_feature_number,
                                                   m_subspace_width, encoding_setting,
                                                   mySolver, shrinkage)

    # run classifier for every test text
    for t_text in tests:
        test_texts.append(jsonhandler.getUnknownText(t_text))
    proMatrx = mp(n_max_feature_number, m_subspace_width, classifier_set, test_texts)
    ergebnis = argmax(proMatrx, 1)

    result_author_list = []
    for i in range(len(ergebnis)):
        result_author_list.append(authors[ergebnis[i]])
        prob_list.append(proMatrx[i][ergebnis[i]])

    jsonhandler.storeJson(outputdir, tests, result_author_list, prob_list)
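# A minimal sketch of how one member of the LDA ensemble above could be fit
# on a single disjoint feature subspace; it is not the repository's
# exhaustiv_disjoint_subspacing, and the names X, y and subspace_columns are
# hypothetical (X is assumed to be a dense NumPy documents-by-features
# matrix). Note that scikit-learn accepts a shrinkage value only with the
# 'lsqr' and 'eigen' solvers, not with 'svd'.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def fit_subspace_lda_sketch(X, y, subspace_columns, solver='svd', shrinkage=None):
    clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
    clf.fit(X[:, subspace_columns], y)
    return clf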
def tira(corpusdir, outputdir):
    """
    Keyword arguments:
    corpusdir -- Path to a tira corpus
    outputdir -- Output directory
    """
    candidates = jsonhandler.candidates
    unknowns = jsonhandler.unknowns
    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    authors = []
    scores = []
    for filename in unknowns:
        ranking = create_author_ranking(candidates, filename,
                                        method=dict_entropy)
        # ranking = create_simple_ranking(candidates, filename,
        #                                 method=relative_zlib_entropy)
        author = ranking[0][0]
        score = 0.5
        authors.append(author)
        scores.append(score)

    jsonhandler.storeJson(outputdir, unknowns, authors, scores)
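# A minimal sketch of a compression-based distance in the spirit of the
# relative_zlib_entropy mentioned in the commented-out call above (the
# repository's own implementation may differ): the extra bytes needed to
# compress the unknown text after the candidate's training text, which tends
# to be smaller when the two texts share statistical regularities.
import zlib

def relative_zlib_entropy_sketch(known_text, unknown_text):
    c_known = len(zlib.compress(known_text.encode("utf-8")))
    c_both = len(zlib.compress((known_text + unknown_text).encode("utf-8")))
    return c_both - c_known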
def test_method(corpusdir, outputdir, method="d1", n=3, L=2000):
    logging.info("Test method %s with L=%d", method, L)
    authors, scores = create_ranking(n, L, method)
    jsonhandler.storeJson(outputdir, jsonhandler.unknowns, authors, scores)
def tira(corpusdir, outputdir):
    """
    Keyword arguments:
    corpusdir -- Path to a tira corpus
    outputdir -- Output directory
    """
    pos_tag = False
    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    # creating training data
    logging.info("Load the training data...")
    database = Database(150, real_words=True)
    for candidate in jsonhandler.candidates:
        author = Author(candidate)
        for training in jsonhandler.trainings[candidate]:
            logging.info("Author '%s': Loading training '%s'",
                         candidate, training)
            text = Text(jsonhandler.getTrainingText(candidate, training),
                        candidate + " " + training, pos_tag=pos_tag)
            author.add_text(text)
        database.add_author(author)
    database.process()

    for author in database.authors:
        author.calc_cmsz(database)

    # do some training, i.e. check which parameter is best
    logging.info("Start training...")
    results = {}
    for length in range(150, 301, 50):
        logging.info("Check for %s words.", length)
        database.considered_words = length
        database.process()
        results[length] = 0
        for correct_author in database.authors:
            for training in jsonhandler.trainings[correct_author.name]:
                trainingcase = Text(
                    jsonhandler.getTrainingText(correct_author.name, training),
                    "Trainingcase", pos_tag=pos_tag)
                trainingcase.calc_zscores(database)
                deltas = {}
                for author in database.authors:
                    deltas[author] = trainingcase.calc_delta(database, author)
                if min(deltas, key=deltas.get) == correct_author:
                    results[length] += 1

    length = max(results, key=results.get)
    logging.info("Chose %s as length.", str(length))

    # reconfigure the database with the chosen length
    database.considered_words = length
    database.process()

    # run the testcases
    results = []
    for unknown in jsonhandler.unknowns:
        testcase = Text(jsonhandler.getUnknownText(unknown), unknown,
                        pos_tag=pos_tag)
        testcase.calc_zscores(database)
        deltas = {}
        for author in database.authors:
            deltas[author] = testcase.calc_delta(database, author)
        results.append((unknown, min(deltas, key=deltas.get).name))

    texts = [text for (text, candidate) in results]
    cands = [candidate for (text, candidate) in results]
    jsonhandler.storeJson(outputdir, texts, cands)
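# A minimal sketch of Burrows' Delta as used conceptually by calc_delta
# above: the mean absolute difference of z-scores over the considered words.
# It is an illustration only; both arguments are assumed to be dicts mapping
# a word to its z-scored relative frequency, which may not match the
# repository's Text/Author interfaces.
def delta_sketch(test_zscores, author_zscores):
    words = set(test_zscores) & set(author_zscores)
    if not words:
        return float("inf")
    return sum(abs(test_zscores[w] - author_zscores[w]) for w in words) / len(words)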
def main():
    parser = argparse.ArgumentParser(
        description="Tira submission for PPM approach (koppel11)")
    parser.add_argument("-i", action="store", help="path to corpus directory")
    parser.add_argument("-o", action="store", help="path to output directory")
    args = vars(parser.parse_args())

    corpusdir = args["i"]
    outputdir = args["o"]
    if corpusdir is None or outputdir is None:
        parser.print_help()
        return

    candidates = jsonhandler.candidates
    unknowns = jsonhandler.unknowns
    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    texts = {}
    # texts = frozenset() would this work??
    corpus = ""
    print("loading texts for training")
    deletes = []
    for cand in candidates:
        texts[cand] = ""
        for file in jsonhandler.trainings[cand]:
            texts[cand] += jsonhandler.getTrainingText(cand, file)
            # if frozenset() is used:
            # texts.add(jsonhandler.getTrainingText(cand, file))
            print("text " + file + " read")
        if len(texts[cand].split()) < mintrainlen:
            del texts[cand]
            deletes.append(cand)
        else:
            corpus += texts[cand]

    newcands = []
    for cand in candidates:
        if cand not in deletes:
            newcands.append(cand)
    candidates = newcands

    words = [len(texts[cand].split()) for cand in texts]
    minwords = min(words)
    print(minwords)

    fl = training(corpus)

    authors = []
    scores = []
    for file in unknowns:
        print("testing " + file)
        utext = jsonhandler.getUnknownText(file)
        ulen = len(utext.split())
        if ulen < minlen:
            authors.append("None")
            scores.append(0)
        else:
            wins = [0] * len(candidates)
            textlen = min(ulen, minwords)
            print(textlen)
            ustring = "".join(utext.split()[:textlen])
            for i in range(repetitions):
                rfl = random.sample(fl, len(fl) // 2)
                sims = []
                for cand in candidates:
                    candstring = getRandomString(texts[cand], textlen)
                    sims.append(testSim(candstring, ustring, rfl, 1))
                wins[sims.index(max(sims))] += 1
            score = max(wins) / float(repetitions)
            if score >= threshold:
                authors.append(candidates[wins.index(max(wins))])
                scores.append(score)
            else:
                authors.append("None")
                scores.append(score)

    print("storing answers")
    jsonhandler.storeJson(outputdir, unknowns, authors, scores)
def tira(corpusdir, outputdir):
    # load training data
    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    database = Database()
    for candidate in jsonhandler.candidates:
        database.add_author(candidate)
        for training in jsonhandler.trainings[candidate]:
            logging.info("Reading training text '%s' of '%s'",
                         training, candidate)
            text = Text(jsonhandler.getTrainingText(candidate, training),
                        candidate + " " + training)
            try:
                text.create_chunks()
                database.add_text(candidate, text)
            except Exception:
                # logging.info("Text size too small. Skip this text.")
                logging.warning("Text too small. Exit.")
                sys.exit()

    database.calc_initial_feature_set()

    candidates = []  # this list shall contain the most likely candidates

    # We use the unmasking procedure to compare all unknown texts to all
    # enumerated texts of known authorship and then decide which fit best.
    # Runtime could surely be optimized.
    for unknown in jsonhandler.unknowns:
        try:
            # dictionary containing the best (lowest) curve score for every author
            results = {}

            # load the unknown text and create the chunks which are used
            # for the unmasking process
            unknown_text = Text(jsonhandler.getUnknownText(unknown), unknown)
            unknown_text.create_chunks()

            for candidate in jsonhandler.candidates:
                results[candidate] = float("inf")
                for known_text in database.texts[candidate]:
                    # reset the feature list, i.e. create a copy of the
                    # initial list
                    features = list(database.initial_feature_set)

                    # randomly select equally many chunks from each text
                    select_chunks(unknown_text, known_text)

                    # create label vector
                    # (0 -> chunks of unknown texts, 1 -> chunks of known texts)
                    label = [0 for i in range(0, len(unknown_text.selected_chunks))] + \
                            [1 for i in range(0, len(known_text.selected_chunks))]
                    label = numpy.array(label)
                    # the reshape is necessary for the classifier
                    label.reshape(len(unknown_text.selected_chunks) +
                                  len(known_text.selected_chunks), 1)

                    global NUMBER_ITERATIONS
                    global NUMBER_ELIMINATE_FEATURES
                    scores = []
                    for i in range(0, NUMBER_ITERATIONS):
                        logging.info("Iteration #%s for texts '%s' and '%s'",
                                     str(i + 1), unknown, known_text.name)
                        # Create the matrix containing the relative word counts
                        # in each chunk (for the selected features)
                        matrix = [[chunk.count(word) / CHUNK_LENGTH
                                   for word in features]
                                  for chunk in (unknown_text.selected_chunks +
                                                known_text.selected_chunks)]
                        matrix = numpy.array(matrix)

                        # Get a LinearSVC classifier and its score (i.e. accuracy
                        # on the training data). Save this score as a point in
                        # the scores curve. (We want to select the curve with the
                        # steepest decrease.)
                        classifier = svm.LinearSVC()
                        classifier.fit(matrix, label)
                        scores.append(classifier.score(matrix, label))

                        # a list of all feature weights
                        flist = classifier.coef_[0]

                        # Now, we have to delete the strongest weighted features
                        # (NUMBER_ELIMINATE_FEATURES) from each side:
                        # the indices of the largest and smallest weights.
                        delete = list(numpy.argsort(flist)[-NUMBER_ELIMINATE_FEATURES:]) \
                            + list(numpy.argsort(flist)[:NUMBER_ELIMINATE_FEATURES])

                        # We cannot directly use the delete list to eliminate from
                        # the features list since step-by-step elimination changes
                        # the indices.
                        delete_features = []
                        for i in delete:
                            delete_features.append(features[i])

                        logging.info("Delete %s", str(delete_features))
                        for feature in delete_features:
                            # a single feature could appear twice in the
                            # delete list
                            if feature in features:
                                features.remove(feature)

                    # The scores list is now the curve we use to get our results.
                    # Therefore, compare it with previous scores.
                    score = curve_score(scores)
                    logging.info("Calculated a score of %s", str(score))
                    if score < results[candidate]:
                        results[candidate] = score

            # Which author has the lowest (i.e. best) score?
            most_likely_author = min(results, key=results.get)
            logging.info("Most likely author is '%s' with a score of %s",
                         most_likely_author, results[most_likely_author])
            candidates.append(most_likely_author)
        except Exception:
            candidates.append("FILE_TOO_SMALL")

    # save everything in the specified directory
    jsonhandler.storeJson(outputdir, jsonhandler.unknowns, candidates)
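# curve_score is defined elsewhere in the repository. A hedged sketch of one
# plausible choice, consistent with the min-selection above, is to use the
# accuracy that remains after the last elimination round: unmasking expects
# same-author pairs to degrade faster, so a lower final accuracy indicates a
# more likely match.
def curve_score_sketch(scores):
    return scores[-1]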
def heavy_lifter(corpusdir, outputdir):
    # load training data
    jsonhandler.loadJson(corpusdir)
    # Now reads in meta-file.json: corpusdir, upath, candidates, unknowns,
    # encoding, language. corpusdir is the folder containing meta-file.json;
    # upath is the location of the "unknown" folder, i.e. where the unknown
    # texts live; unknowns holds the file names of the unknown texts,
    # e.g. "unknown00011.txt".
    jsonhandler.loadTraining()
    # trainings (dict) is now populated,
    # e.g. {..., 'candidate00010': ['known00001.txt', 'known00002.txt']}

    # database.authors = [], database.texts = {}, database.features = {}
    database = Database()
    for candidate in jsonhandler.candidates:
        # database.authors = [cand1, cand2, ...]; database.texts = {author: [], ...}
        database.add_author(candidate)
        # iterate over the file names of this candidate's training texts
        for training in jsonhandler.trainings[candidate]:
            # logging.info("Reading training text '%s' of '%s'", training, candidate)
            # create a Text instance, Text(raw, name);
            # name looks like "candidate00001 known00002.txt"
            text = Text(jsonhandler.getTrainingText(candidate, training),
                        candidate + " " + training)
            try:
                # Text.chunks is a list of lists (many chunks, each chunk a
                # list of tokens)
                text.chunks_generator()
                # generate the n-grams of the Text instance
                text.ngrams_generator()
                # populate database.texts: {candidate00001: [text, text, ...]}
                database.add_text(candidate, text)
            except Exception:
                # logging.info("Text size too small. Skip this text.")
                # logging.warning("Text too small. Exit.")
                # sys.exit()
                print('Something went wrong.')
    database.feature_generator()

    # We use the unmasking procedure to compare all unknown texts to all
    # enumerated texts of known authorship and then decide which fit best.
    for unknown in jsonhandler.unknowns:
        # dictionary containing the per-fold accuracy scores of each candidate
        results = {}
        dropped_features = {}
        # load the unknown text and create the chunks which are used for the
        # unmasking process; yields unknown_text.tokens (list) and
        # unknown_text.chunks (list of lists)
        unknown_text = Text(jsonhandler.getUnknownText(unknown), unknown)
        unknown_text.chunks_generator()

        for candidate in jsonhandler.candidates:
            # print(candidate)
            # results is the overall dict; each key maps to a list with one
            # sub-list per experiment (NUMBER_EXPERIMENTS rounds, each a
            # NUMBER_ITERATIONS-fold run)
            results[candidate] = []
            dropped_features[candidate] = []
            # features = (database.features[candidate]).copy()
            for known_text in database.texts[candidate]:
                # experiment_scores = []
                # randomly select equally many chunks from each text; the
                # experiment is repeated, each time randomly dropping the
                # surplus chunks
                for experiment in range(0, NUMBER_EXPERIMENTS):
                    features = (database.features[candidate]).copy()
                    select_chunks(unknown_text, known_text)
                    # (dropped_features[candidate])[experiment] = []

                    # create label vector
                    # (0 for chunks of unknown texts, 1 for chunks of known texts)
                    label = [0 for i in range(0, len(unknown_text.selected_chunks))] + \
                            [1 for i in range(0, len(known_text.selected_chunks))]
                    label = np.array(label)
                    # the reshape is necessary for the classifier
                    label.reshape(len(unknown_text.selected_chunks) +
                                  len(known_text.selected_chunks), 1)

                    global NUMBER_ITERATIONS, FOLD_ELIMINATION
                    fold_scores = []
                    for i in range(0, NUMBER_ITERATIONS):
                        # logging.info("Iteration #%s for texts '%s' and '%s'",
                        #              str(i + 1), unknown, known_text.name)
                        # (dropped_features[candidate])[i] = []
                        matrix = [[chunk[0].count(token) / len(chunk2tokens(chunk[0]))
                                   for token in features]
                                  for chunk in (unknown_text.selected_chunks +
                                                known_text.selected_chunks)]
                        matrix = np.array(matrix)
                        print("Running experiment " + str(experiment + 1) +
                              ", iteration " + str(i + 1) + " for " +
                              candidate + ".", '\n')

                        classifier = svm.LinearSVC()
                        # classifier = svm.SVC(kernel='linear')
                        classifier.fit(matrix, label)
                        fold_scores.append(classifier.score(matrix, label))

                        weights = classifier.coef_[0]
                        delete = list(np.argsort(weights)[-FOLD_ELIMINATION:]) \
                            + list(np.argsort(weights)[:FOLD_ELIMINATION])
                        # We cannot directly use the delete list to eliminate from
                        # the features list since step-by-step elimination changes
                        # the indices.
                        delete_features = []
                        for x in delete:
                            delete_features.append(features[x])
                        # ((dropped_features[candidate])[experiment]).append(delete_features)
                        (dropped_features[candidate]).append(delete_features)
                        # logging.info("Delete %s", str(delete_features))
                        for feature in delete_features:
                            if feature in features:
                                features.remove(feature)

                    # experiment_scores.append(fold_scores)
                    # experiment_scores = np.array(experiment_scores)
                    # experiment_scores = np.mean(experiment_scores, axis=0)
                    (results[candidate]).append(fold_scores)

    # save everything in the specified directory
    jsonhandler.storeJson(outputdir, results, dropped_features)
def main():
    """The main function."""
    parser = argparse.ArgumentParser(
        description="PPM approach according to Koppel11")
    parser.add_argument("-i", action="store", help="path to corpus directory")
    parser.add_argument("-o", action="store", help="path to output directory")
    args = vars(parser.parse_args())

    corpusdir = args["i"]
    outputdir = args["o"]
    if corpusdir is None or outputdir is None:
        parser.print_help()
        return

    candidates = jsonhandler.candidates
    unknowns = jsonhandler.unknowns
    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    texts = {}
    corpus = ""
    print("Loading texts for training...")
    deletes = []
    for cand in candidates:
        texts[cand] = ""
        for file in jsonhandler.trainings[cand]:
            texts[cand] += jsonhandler.getTrainingText(cand, file)
            print(f"Text {file} read")
        if len(texts[cand].split()) < MINTRAINLEN:
            del texts[cand]
            deletes.append(cand)
        else:
            corpus += texts[cand]

    newcands = []
    for cand in candidates:
        if cand not in deletes:
            newcands.append(cand)
    candidates = newcands

    words = [len(texts[cand].split()) for cand in texts]
    minwords = min(words)
    print(minwords)

    feature_list = training(corpus)

    authors = []
    scores = []
    for file in unknowns:
        print(f"Testing {file}")
        utext = jsonhandler.getUnknownText(file)
        ulen = len(utext.split())
        if ulen < MINLEN:
            authors.append("None")
            scores.append(0)
        else:
            wins = [0] * len(candidates)
            textlen = min(ulen, minwords)
            print(textlen)
            ustring = "".join(utext.split()[:textlen])
            for _ in range(REPETITIONS):
                rfl = random.sample(feature_list, len(feature_list) // 2)
                sims = []
                for cand in candidates:
                    candstring = get_random_string(texts[cand], textlen)
                    sims.append(test_sim(candstring, ustring, rfl, 1))
                wins[sims.index(max(sims))] += 1
            score = max(wins) / float(REPETITIONS)
            if score >= THRESHOLD:
                authors.append(candidates[wins.index(max(wins))])
                scores.append(score)
            else:
                authors.append("None")
                scores.append(score)

    print("Storing answers...")
    jsonhandler.storeJson(outputdir, unknowns, authors, scores)
    print("Done!")
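# A minimal sketch of the min-max similarity that impostor-style methods such
# as koppel11 typically use for test_sim; it is an assumption, not the
# repository's implementation. Each feature in the randomly sampled subset is
# counted in both strings, and the ratio of summed minima to summed maxima is
# returned.
def minmax_similarity_sketch(text_a, text_b, features):
    counts_a = [text_a.count(f) for f in features]
    counts_b = [text_b.count(f) for f in features]
    denominator = sum(max(a, b) for a, b in zip(counts_a, counts_b))
    if denominator == 0:
        return 0.0
    return sum(min(a, b) for a, b in zip(counts_a, counts_b)) / denominator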