def topicOfClassificationForAllYear(probDir, modelDir, classDir, clf_dict, fun):
    """Build, for every year file, the mapping from classifications to topics.

    For each processed file the dominant topic of every document is taken as
    the argmax (along axis 1) of the stored probability object, the
    classifications are extracted with ``classificationOfDocument`` and both
    are combined by ``topicOfClassification``.

    Args:
        probDir: Directory whose files hold the stored probability objects.
        modelDir: Directory holding the topic files. Only the number of files
            is checked here, because documents are labelled with topic
            indices rather than topic words.
        classDir: Directory whose files hold the classification data.
        clf_dict: Classification dictionary, passed through unchanged to
            ``classificationOfDocument`` and ``topicOfClassification``.
        fun: Mode selector, also passed through to both helpers. ``0``
            processes every file; ``1`` skips the first five files.

    Returns:
        dict: Maps the year string cut out of each probability file name to
        the result of ``topicOfClassification`` for that file.

    Raises:
        ValueError: If ``fun`` is neither 0 nor 1.
        SystemExit: If the three directories do not contain the same number
            of files.
    """
    probFiles = fileSys.traverseDirectory(probDir)
    topicFiles = fileSys.traverseTopicDirecotry(modelDir, 1)
    classFiles = fileSys.traverseDirectory(classDir)

    # The three listings are consumed position by position, so they must
    # have the same length.
    N = len(probFiles)
    if len(topicFiles) != N or len(classFiles) != N:
        # Parenthesised form prints the same text under Python 2 and 3.
        print("numbers of files are not same")
        sys.exit('System will exit')

    if fun == 0:
        start = 0
    elif fun == 1:
        # acm-class start from 1998 (original note); presumably the first
        # five files predate that year -- TODO confirm against the data set.
        start = 5
    else:
        # Previously an unsupported mode only failed later with an
        # unbound-name error on the loop range.
        raise ValueError("fun must be 0 or 1, got %r" % (fun,))

    all_clf_topic = {}
    for prob_file, class_file in zip(probFiles[start:], classFiles[start:]):
        prob = ioFile.load_object(prob_file)
        inFile = ioFile.dataFromFile(class_file)

        # assumes file names end in "<YYYY>" plus a 4-character suffix such
        # as ".pkl" -- TODO confirm
        year = prob_file[-8:-4]

        # Index of the largest entry along axis 1, converted to an ndarray
        # with singleton axes dropped.
        doc_topic = np.squeeze(np.array(prob.argmax(1)))

        all_clf, unique_clf = classificationOfDocument(inFile, clf_dict, fun)
        all_clf_topic[year] = topicOfClassification(
            unique_clf, all_clf, clf_dict, doc_topic, fun)

    return all_clf_topic
def topics_for_year(year):
    """Create the topic tree for the LDA model of a single year.

    The file listings of the ``topic`` and ``distance`` sub-directories of
    ``lda_model/<year>`` (joined under ``root_path``) are handed to
    ``graph.createTree``.

    Args:
        year: Year selecting the model directory; converted with ``str``
            before being joined into the path.

    Returns:
        The tree object produced by ``graph.createTree``.
    """
    # Both listings live under the same per-year model directory.
    year_dir = path.join(root_path, 'lda_model', str(year))
    topic_listing = fileSys.traverseDirectory(path.join(year_dir, 'topic'))
    distance_listing = fileSys.traverseDirectory(path.join(year_dir, 'distance'))
    return graph.createTree(topic_listing, distance_listing)
def topics_for_cs():
    """Create the topic tree from the ``htm`` data under ``root_path``.

    The file listings of ``htm/topic`` and ``htm/distance`` are handed to
    ``graph.createTree``.

    Returns:
        The tree object produced by ``graph.createTree``.
    """
    topic_dir = path.join(root_path, 'htm/topic')
    distance_dir = path.join(root_path, 'htm/distance')
    topic_listing = fileSys.traverseDirectory(topic_dir)
    distance_listing = fileSys.traverseDirectory(distance_dir)
    return graph.createTree(topic_listing, distance_listing)