def createTree(topicFiles, distanceFiles):
    """Build a nested topic tree from per-level topic and distance files.

    Args:
        topicFiles: paths readable by ``ioFile.load_object``; index 0 is the
            bottom level of the tree. Each loaded object is iterated as a
            sequence of topics, and each topic is joined with spaces to form
            the node name.
        distanceFiles: paths readable by ``ioFile.load_object``;
            ``distanceFiles[i]`` is loaded as a matrix whose row-wise argmin
            assigns every node of level ``i`` to a parent at level ``i + 1``.

    Returns:
        dict: ``{"name": '...', "children": [...]}`` where the children are
        the nodes of the last level. Bottom-level nodes carry ``"size": 1``;
        all other nodes carry a ``"children"`` list.
    """
    level = len(topicFiles)
    nodes = []
    parent = []
    for i, topic_file in enumerate(topicFiles):
        topics = ioFile.load_object(topic_file)
        if i == 0:
            # nodes at the bottom level of the tree
            nodes = [{"name": ' '.join(topic), "size": 1} for topic in topics]
        else:
            pre_nodes = nodes
            nodes = []
            for j, topic in enumerate(topics):
                # Children are the previous-level nodes whose nearest
                # topic at this level is j.
                children = [pre_nodes[k] for k in np.where(parent == j)[0]]
                nodes.append({"name": ' '.join(topic), "children": children})
        if i < level - 1:
            distances = np.matrix(ioFile.load_object(distanceFiles[i]))
            # ravel() (instead of squeeze()) keeps `parent` one-dimensional
            # even when this level holds a single node; a 0-d array would
            # make the np.where() lookup above fail on recent NumPy.
            parent = np.asarray(distances.argmin(1)).ravel()
    return {"name": '...', "children": nodes}
def get_classes(class_mode):
    """Load the stored class list for a classification scheme.

    Args:
        class_mode: ``'acm-class'`` or ``'arxiv-category'``.

    Returns:
        The object ``ioFile.load_object`` yields for the scheme's pickle file
        under ``<root_path>/class_topic``.

    Raises:
        ValueError: if ``class_mode`` is not one of the supported schemes
            (previously this surfaced as an UnboundLocalError).
    """
    class_files = {
        'acm-class': 'acm_class.pkl',
        'arxiv-category': 'arxiv_category.pkl',
    }
    if class_mode not in class_files:
        raise ValueError("Unsupported class_mode: %r" % (class_mode,))
    fname = path.join(root_path, 'class_topic', class_files[class_mode])
    return ioFile.load_object(fname)
def statistics_for_class(class_mode, class_name):
    """Compute per-year bar-chart data for one class.

    For every year from 1993 through 2015 the entries stored for
    ``class_name`` are counted with ``Counter``; years with no entry for
    the class are reported on stdout and skipped.

    Args:
        class_mode: ``'acm-class'`` or ``'arxiv-category'``; selects which
            class/topic pickle is loaded.
        class_name: key looked up inside each year's mapping.

    Returns:
        The result of ``graph.createBarChat`` for the years that had data.

    Raises:
        ValueError: if ``class_mode`` is not one of the supported schemes
            (previously this surfaced as an UnboundLocalError).
    """
    stat_files = {
        'acm-class': 'class_topic/class_topic_acm-class.pkl',
        'arxiv-category': 'class_topic/class_topic_arxiv-category.pkl',
    }
    if class_mode not in stat_files:
        raise ValueError("Unsupported class_mode: %r" % (class_mode,))
    clf_topic = ioFile.load_object(path.join(root_path, stat_files[class_mode]))

    clf_topic_stat = []
    years = []
    for year in range(1993, 2016):
        # Only the lookup can raise KeyError, so only it lives in the try.
        try:
            assigned = clf_topic[str(year)][class_name]
        except KeyError:
            # Single parenthesised argument: prints identically on
            # Python 2 and Python 3.
            print("No documents belonging to %s in %s " % (class_name, year))
            continue
        clf_topic_stat.append(Counter(assigned))
        years.append(year)

    topicFiles = fileSys.traverseTopicDirecotry(model_path, 1, years)
    return graph.createBarChat(topicFiles, clf_topic_stat, years)
def createLink(filenames, topic_num, fun, clf_topic_stat=None):
    """Create the links between topic nodes of consecutive years.

    Args:
        filenames: distance files readable by ``ioFile.load_object``; file
            ``i`` holds the distances between the topics of year ``i`` and
            year ``i + 1``.
        topic_num: number of topic nodes per year (array or list). It is no
            longer modified in place.
        fun: linking strategy.
            ``0`` links every pair whose distance is below
            ``distance_constraint``;
            ``1`` links each topic to its nearest topic of the next year
            (first minimum);
            ``2`` is like ``0`` but restricted to the topics present in
            ``clf_topic_stat``, with node offsets taken from the position of
            each topic among that year's keys.
        clf_topic_stat: per-year mappings keyed by topic index; only read
            when ``fun == 2``.

    Returns:
        list of ``{"source": int, "target": int, "value": 5}`` dicts.
    """
    # start index of each year; every year's block is assumed to begin with
    # one extra node (presumably the blank node createNode emits - confirm),
    # hence the +1 here and on the first-node indexes below.
    # Working on a copy keeps the caller's array untouched and also accepts
    # a plain list.
    per_year = np.asarray(topic_num) + 1
    node_index = [0] + np.cumsum(per_year).tolist()[:-1]

    links = []
    for i, fname in enumerate(filenames):
        # indexes of first nodes in the graph for year i and i+1
        node_index_i = node_index[i] + 1
        node_index_j = node_index[i + 1] + 1
        # distances between year i and i+1
        distances = ioFile.load_object(fname)
        if fun < 2:
            for index_i in range(len(distances)):
                row = np.array(distances[index_i])
                if fun == 0:
                    for index_j in np.where(row < distance_constraint)[0]:
                        links.append({
                            "source": node_index_i + index_i,
                            "target": node_index_j + index_j,
                            "value": 5
                        })
                elif fun == 1:
                    nearest = np.where(row == row.min())[0][0]
                    links.append({
                        "source": node_index_i + index_i,
                        "target": node_index_j + nearest,
                        "value": 5
                    })
        elif fun == 2:
            current = clf_topic_stat[i]
            following = clf_topic_stat[i + 1]
            # Position of each topic among its year's keys, computed once
            # instead of a linear keys().index() search per link.
            position_i = {key: pos for pos, key in enumerate(current.keys())}
            position_j = {key: pos for pos, key in enumerate(following.keys())}
            following_keys = set(following.keys())
            for index_i in current.keys():
                row = np.array(distances[index_i])
                close = set(np.where(row < distance_constraint)[0])
                for index_j in close.intersection(following_keys):
                    links.append({
                        "source": node_index_i + position_i[index_i],
                        "target": node_index_j + position_j[index_j],
                        "value": 5
                    })
    return links
def createBarChat(topicFiles, clf_topic_stat, years):
    """Assemble one bar-chart record per topic file.

    Args:
        topicFiles: paths readable by ``ioFile.load_object``.
        clf_topic_stat: per-file statistics, indexed in step with
            ``topicFiles``.
        years: per-file year labels, indexed in step with ``topicFiles``.

    Returns:
        list of ``{"year", "doc", "topics"}`` dicts, where ``doc`` and
        ``topics`` are the two values returned by ``statOfClassification``.
    """
    bar_data = []
    for position, topic_file in enumerate(topicFiles):
        topics = ioFile.load_object(topic_file)
        stat = clf_topic_stat[position]
        year = years[position]
        doc_num, topic_percent = statOfClassification(stat, topics)
        bar_data.append({"year": year, "doc": doc_num, "topics": topic_percent})
    return bar_data
def topics_for_class(class_mode, class_name, start, end):
    """Build the topic graph of one class over a range of years.

    Args:
        class_mode: ``'acm-class'`` or ``'arxiv-category'``; selects which
            class/topic pickle is loaded.
        class_name: key looked up inside each year's mapping.
        start: first year (inclusive).
        end: last year (inclusive).

    Returns:
        tuple: ``(topic_graph, years)`` where ``topic_graph`` is the result
        of ``graph.createGraph`` and ``years`` is the set of years in
        ``[start, end]`` that had an entry for ``class_name``.

    Raises:
        ValueError: if ``class_mode`` is not one of the supported schemes
            (previously this surfaced as an UnboundLocalError).
    """
    stat_files = {
        'acm-class': 'class_topic/class_topic_acm-class.pkl',
        'arxiv-category': 'class_topic/class_topic_arxiv-category.pkl',
    }
    if class_mode not in stat_files:
        raise ValueError("Unsupported class_mode: %r" % (class_mode,))
    clf_topic = ioFile.load_object(path.join(root_path, stat_files[class_mode]))

    clf_topic_stat = []
    topic_num = []
    years = set(range(start, end + 1))
    for year in range(start, end + 1):
        try:
            assigned = clf_topic[str(year)][class_name]
        except KeyError:
            # No entry for this class in this year: drop the year.
            years.remove(year)
            continue
        clf_topic_stat.append(Counter(assigned))
        topic_num.append(len(set(assigned)))

    # clf_topic_stat / topic_num are in chronological order, while a set has
    # no guaranteed iteration order, so hand the helpers a sorted list.
    ordered_years = sorted(years)
    topicFiles = fileSys.traverseTopicDirecotry(model_path, 1, ordered_years)
    distance_dir = path.join(root_path, 'class_topic/distance')
    distanceFiles = fileSys.traverseDistanceDirectory(distance_dir, ordered_years)
    topic_graph = graph.createGraph(topicFiles, distanceFiles, 2,
                                    clf_topic_stat, topic_num)
    return topic_graph, years
def createTree(topicFiles, distanceFiles):
    """Assemble a hierarchical topic tree, bottom level first.

    Args:
        topicFiles: one file per tree level, readable by
            ``ioFile.load_object``; ``topicFiles[0]`` is the bottom level.
            Every topic of a level is joined with spaces to name its node.
        distanceFiles: ``distanceFiles[i]`` is loaded as a matrix; the
            column index of the minimum of each row gives the parent (at
            level ``i + 1``) of the corresponding node at level ``i``.

    Returns:
        dict: root node ``{"name": '...', "children": nodes}`` holding the
        nodes of the last level. Bottom-level nodes have ``"size": 1``;
        higher-level nodes have a ``"children"`` list.
    """
    level = len(topicFiles)
    nodes = []
    parent = []
    for i in range(level):
        topics = ioFile.load_object(topicFiles[i])
        if i == 0:
            # nodes at the bottom level of the tree
            for topic in topics:
                nodes.append({"name": ' '.join(topic), "size": 1})
        else:
            pre_nodes = nodes
            nodes = []
            for j in range(len(topics)):
                # previous-level nodes whose closest topic here is j
                indexes = np.where(parent == j)[0]
                children = [pre_nodes[index] for index in indexes]
                nodes.append({"name": ' '.join(topics[j]), "children": children})
        if i < level - 1:
            distances = np.matrix(ioFile.load_object(distanceFiles[i]))
            # squeeze() alone yields a 0-d array when this level has a single
            # node, which breaks the np.where() lookup above on recent NumPy;
            # atleast_1d keeps `parent` indexable in that case.
            parent = np.atleast_1d(np.squeeze(np.array(distances.argmin(1))))
    root = {"name": '...', "children": nodes}
    return root
def createBarChat(topicFiles, clf_topic_stat, years):
    """Produce the bar-chart records, one per entry of ``topicFiles``.

    Args:
        topicFiles: paths readable by ``ioFile.load_object``.
        clf_topic_stat: statistics for each file, same indexing as
            ``topicFiles``.
        years: year label for each file, same indexing as ``topicFiles``.

    Returns:
        list of dicts with keys ``"year"``, ``"doc"`` and ``"topics"``; the
        last two are the pair returned by ``statOfClassification``.
    """
    def bar_for(idx):
        # Build the record for the idx-th file.
        topics = ioFile.load_object(topicFiles[idx])
        counts = clf_topic_stat[idx]
        label = years[idx]
        doc_num, topic_percent = statOfClassification(counts, topics)
        return {
            "year": label,
            "doc": doc_num,
            "topics": topic_percent
        }

    return [bar_for(idx) for idx in range(len(topicFiles))]
def createNode(filenames, clf_topic_stat=None):
    """Create the graph nodes for a sequence of topic files.

    For every file a blank node (``{"name": ''}``) is emitted first,
    followed by one node per topic named by the space-joined topic.

    Args:
        filenames: topic files readable by ``ioFile.load_object``.
        clf_topic_stat: optional per-file mappings keyed by topic index.
            When given, only the topics listed in ``clf_topic_stat[i]`` get
            a node for file ``i``; otherwise every topic does.

    Returns:
        tuple: ``(nodes, topic_num)`` where ``nodes`` is the flat list of
        node dicts and ``topic_num`` is a NumPy array holding
        ``len(topics)`` for each file.
    """
    nodes = []
    topic_num = []
    for i, fname in enumerate(filenames):
        topics = ioFile.load_object(fname)
        # NOTE(review): this records the full topic count even when
        # clf_topic_stat restricts which topics become nodes - confirm that
        # callers expect that.
        topic_num.append(len(topics))
        nodes.append({"name": ''})
        if clf_topic_stat is None:
            nodes.extend({"name": ' '.join(topic)} for topic in topics)
        else:
            selected = clf_topic_stat[i]
            nodes.extend({"name": ' '.join(topics[index])}
                         for index in selected.keys())
    return nodes, np.array(topic_num)
def createNode(filenames, clf_topic_stat=None):
    """Build the flat node list used by the topic graph.

    Each file contributes a leading blank node (``{"name": ''}``) and then
    one node per topic, whose name is the topic joined with spaces.

    Args:
        filenames: topic files readable by ``ioFile.load_object``.
        clf_topic_stat: optional list of mappings keyed by topic index, one
            per file. If supplied, only the keyed topics of each file are
            turned into nodes.

    Returns:
        tuple: ``(nodes, topic_num)`` - the list of node dicts and a NumPy
        array with ``len(topics)`` of every file.
    """
    nodes = []
    topic_num = []
    restrict = clf_topic_stat is not None  # identity test, not `== None`
    for i, fname in enumerate(filenames):
        topics = ioFile.load_object(fname)
        # NOTE(review): the stored count is the size of the whole topic
        # list, also when only a subset becomes nodes - verify with callers.
        topic_num.append(len(topics))
        nodes.append({"name": ''})
        if restrict:
            for index in clf_topic_stat[i].keys():
                nodes.append({"name": ' '.join(topics[index])})
        else:
            for topic in topics:
                nodes.append({"name": ' '.join(topic)})
    return nodes, np.array(topic_num)
def createLink(filenames, topic_num, fun, clf_topic_stat=None):
    """Build the edges that connect topics of one year to the next.

    Args:
        filenames: distance files readable by ``ioFile.load_object``; the
            ``i``-th file describes distances from the topics of year ``i``
            to those of year ``i + 1``.
        topic_num: per-year topic counts (array or list); left unmodified.
        fun: selects how links are chosen.
            ``0`` - every pair closer than ``distance_constraint``;
            ``1`` - for each topic, the first closest topic of the next
            year;
            ``2`` - pairs closer than ``distance_constraint`` where both
            topics are keys of ``clf_topic_stat`` for their year; node
            offsets are the positions of those keys.
        clf_topic_stat: per-year mappings keyed by topic index; used only
            when ``fun == 2``.

    Returns:
        list of dicts of the form
        ``{"source": ..., "target": ..., "value": 5}``.
    """
    def make_link(source, target):
        # Single place that fixes the shape of a link record.
        return {"source": source, "target": target, "value": 5}

    # start index of each year. One is added per year for the extra node
    # that is assumed to precede each year's topics (presumably the blank
    # node from createNode - confirm). A fresh array is used so the caller's
    # topic_num is not changed and a plain list is accepted too.
    sizes = np.asarray(topic_num) + 1
    node_index = np.cumsum(sizes).tolist()[:-1]
    node_index.insert(0, 0)

    links = []
    for i, fname in enumerate(filenames):
        # indexes of first nodes in the graph for year i and i+1
        node_index_i = node_index[i] + 1
        node_index_j = node_index[i + 1] + 1
        # distances between year i and i+1
        distances = ioFile.load_object(fname)

        if fun < 2:
            for index_i in range(len(distances)):
                row = np.array(distances[index_i])
                if fun == 0:
                    links.extend(
                        make_link(node_index_i + index_i, node_index_j + index_j)
                        for index_j in np.where(row < distance_constraint)[0]
                    )
                elif fun == 1:
                    index = np.where(row == row.min())[0][0]
                    links.append(
                        make_link(node_index_i + index_i, node_index_j + index))
        elif fun == 2:
            keys_i = list(clf_topic_stat[i].keys())
            keys_j = list(clf_topic_stat[i + 1].keys())
            # Key -> position lookups replace repeated linear
            # keys().index() scans (which also do not exist on Python 3).
            offset_i = dict((key, pos) for pos, key in enumerate(keys_i))
            offset_j = dict((key, pos) for pos, key in enumerate(keys_j))
            allowed = set(keys_j)
            for index_i in keys_i:
                row = np.array(distances[index_i])
                near = set(np.where(row < distance_constraint)[0])
                for index_j in near.intersection(allowed):
                    links.append(make_link(node_index_i + offset_i[index_i],
                                           node_index_j + offset_j[index_j]))
    return links