def parse(filename, show=False, external=True):
    """Load an experiment-results JSON file and optionally plot its scores.

    The file is expected to hold a dict with three entries:

    - ``data[TEST]``: a list (normally one element) of dicts mapping an
      experiment name to a dict of metrics; each metrics dict must contain
      a ``'Predicted_Mean'`` entry.
    - ``data[PARAMS]``: the parameters used to generate the data.
    - ``data[DATA]``: statistics about the generating model.

    For real (non-synthetic) data the last two are expected to be empty.

    Args:
        filename: path of the JSON file to read. When the plot is saved
            rather than shown, it is written to ``filename + '.pdf'``.
        show: if true, display the figure via ``util.show()``; otherwise
            save it. Only consulted when ``external`` is true.
        external: whether this is being called as the command-line entry
            point. When true the scores are plotted; pass False to get the
            parsed values back without any plotting.

    Returns:
        A tuple ``(names, scores, tests, parameters, data)`` where

        - ``names`` lists the experiment names not present in ``HIDDEN``,
          ordered by ascending ``'Predicted_Mean'`` within each test dict;
        - ``scores`` holds the matching ``'Predicted_Mean'`` values, in the
          same order as ``names``;
        - ``tests`` is the first element of ``data[TEST]``;
        - ``parameters`` and ``data`` are ``data[PARAMS]`` and
          ``data[DATA]`` exactly as loaded.
    """
    with open(filename, 'r') as f:
        data = json.load(f)

    names = []
    scores = []
    for test in data[TEST]:
        # A key function replaces the old ``cmp=`` comparator: the ordering
        # is the same (ascending, stable), and it also runs on Python 3,
        # where both ``sorted(cmp=...)`` and the ``cmp`` builtin are gone.
        ordered = sorted(test, key=lambda name: test[name]['Predicted_Mean'])
        for key in ordered:
            if key not in HIDDEN:
                names.append(str(key))
                scores.append(test[key]['Predicted_Mean'])

    # NOTE(review): the nesting below was reconstructed from a source whose
    # whitespace had been collapsed; confirm that the legend and the
    # show/save step are meant to run only when ``external`` is true.
    if external:
        util.figure(figsize=(12, 7))
        util.plot_dist(scores, labels=names)
        if data[DATA]:
            for datum in data[DATA]:
                if datum in ('sig_words', 'sig_topics'):
                    # Plotting a single 0 only contributes a legend entry
                    # that reports the statistic's value.
                    util.plot(0, label=str(datum) + " " + str(data[DATA][datum]))
        util.legend(loc='best')
        if show:
            util.show()
        else:
            util.savefig(filename + '.pdf')

    return names, scores, data[TEST][0], data[PARAMS], data[DATA]
def showSigTopics(file='lda/trained/', threshold=0.8):
    """Plot how many topics each document needs to reach ``threshold`` mass.

    Reads the matrix returned by ``get_matrix(file + 'final.gamma')`` and
    treats each row as one document. For every row the entries are sorted
    in descending order and normalised by the row sum; the number of
    leading entries needed for their running total to reach ``threshold``
    is recorded. The per-document counts are passed to ``util.plot`` and
    the figure is displayed with ``pylab.show()``.

    Args:
        file: prefix (typically a directory path ending in ``/``) to which
            ``'final.gamma'`` is appended to locate the matrix.
        threshold: fraction of a row's total that the largest entries must
            cover. Defaults to 0.8, the value that was previously
            hard-coded.

    Raises:
        ZeroDivisionError: if a non-empty row sums to zero.
    """
    gamma = get_matrix(file + 'final.gamma')
    numDocs, _ = gamma.shape

    # Convert once up front; the conversion does not depend on the row and
    # was previously repeated (copying the whole matrix) for every document.
    rows = np.array(gamma)

    sigTopicsPerDoc = []
    for docNum in range(numDocs):
        weights = sorted(rows[docNum].tolist(), reverse=True)

        total = 0
        for weight in weights:
            total += weight
        shares = [weight / total for weight in weights]

        cdf = 0
        numSigTopics = 0
        # Bound the scan by the row length so an empty row, rounding error,
        # or a threshold above 1 cannot index past the end of ``shares``.
        while cdf < threshold and numSigTopics < len(shares):
            cdf += shares[numSigTopics]
            numSigTopics += 1
        sigTopicsPerDoc.append(numSigTopics)

    util.plot(sigTopicsPerDoc)
    pylab.show()