def indexing(del_nodes_count=1, ignored_tags=None):
    """Build the tag-pair co-occurrence index and pickle it to disk.

    A tag is kept when it is not ignored and occurs more than
    ``del_nodes_count`` times across the corpus.  For every unordered pair
    of kept tags the index records how many corpus items carry both tags
    and the ids of those items.  Pairs shared by fewer than two items are
    dropped.  The result is written to ``TWIN_TAGS_INDEX_FILE``.

    Args:
        del_nodes_count: Occurrence threshold; tags seen this many times or
            fewer are discarded.
        ignored_tags: Container of tags to skip.  When falsy (``None`` or
            empty) the list from ``read_ignore_tags()`` is used instead.

    The pickled mapping has the shape
    ``{(tag_a, tag_b): [count, [item_id, ...]]}`` with ``tag_a < tag_b``.
    """
    if not ignored_tags:
        ignored_tags = read_ignore_tags()

    # Read the corpus once; both passes below iterate over the same items.
    corpus = list(read_corpus())

    # Count every occurrence of each non-ignored tag in a single pass.
    tag_counts = {}
    for item in corpus:
        for tag in item['tags']:
            if tag not in ignored_tags:
                tag_counts[tag] = tag_counts.get(tag, 0) + 1

    # Keep only tags that occur often enough.
    kept_tags = {
        tag for tag, count in tag_counts.items() if count > del_nodes_count
    }

    # Number of candidate pairs (same figure the full pair table would have).
    print(len(kept_tags) * (len(kept_tags) - 1) // 2)

    # Count tag connections.  Each item's kept tags are de-duplicated and
    # sorted so that a pair always maps to the same key regardless of the
    # order in which the tags appear in the item.
    pair_index = {}
    for item in corpus:
        item_tags = sorted(kept_tags.intersection(item['tags']))
        for pair in itertools.combinations(item_tags, 2):
            entry = pair_index.setdefault(pair, [0, []])
            entry[0] += 1
            entry[1].append(item['id'])

    # Drop pairs that are shared by fewer than two items.
    formatted_tags = {
        pair: entry for pair, entry in pair_index.items() if entry[0] >= 2
    }

    with open(TWIN_TAGS_INDEX_FILE, 'wb') as out_file:
        pickle.dump(formatted_tags, out_file)
def create_graph2():
    """Build a weighted, undirected graph of articles linked by shared tags.

    Two articles are connected when they share at least one non-ignored
    tag; the edge weight is the number of such shared tags.  Vertices carry
    the article id in the ``id`` attribute.  Vertex and edge counts are
    printed before the graph is returned.

    Returns:
        The graph built by ``igraph.Graph.TupleList``.
    """
    ignored_tags = read_ignore_tags()
    corpus = list(read_corpus())

    # Index {tag} => {article id, ...}, skipping ignored tags.
    tags_corpus_index = {}
    for item in corpus:
        for tag in item['tags']:
            if tag not in ignored_tags:
                tags_corpus_index.setdefault(tag, set()).add(item['id'])

    # Count, for every pair of articles, how many tags they share.
    pair_weights = {}
    for items_idx in tags_corpus_index.values():
        # A tag used by a single article cannot link anything.
        if len(items_idx) < 2:
            continue
        # Sorting the ids first makes every emitted pair canonical
        # (smaller id first), so (a, b) and (b, a) share one key.
        for pair in itertools.combinations(sorted(items_idx), 2):
            pair_weights[pair] = pair_weights.get(pair, 0) + 1

    # (source id, target id, weight) rows for TupleList.
    edges = [(*pair, weight) for pair, weight in pair_weights.items()]

    # Create graph
    g: Graph = igraph.Graph.TupleList(edges,
                                      vertex_name_attr='id',
                                      directed=False,
                                      weights=True)
    print(g.vcount())
    print(g.ecount())
    return g
def create_graph():
    """Build an unweighted, undirected graph of articles linked by tags.

    Every corpus item becomes a vertex whose ``id`` attribute holds the
    item id.  Two vertices are joined by a single edge when the items share
    at least one non-ignored tag.  Vertex count, edge count and (when there
    is one) the last edge are printed before the graph is returned.

    Returns:
        The populated ``igraph.Graph``.
    """
    ignored_tags = read_ignore_tags()
    corpus = list(read_corpus())

    # Create graph
    g: Graph = igraph.Graph(directed=False)

    # Add one vertex per corpus item and label it with the item id.
    g.add_vertices(len(corpus))
    for idx, item in enumerate(corpus):
        g.vs[idx]["id"] = item['id']

    # Index {tag} => {article id, ...}, skipping ignored tags.
    tags_corpus_index = {}
    for item in corpus:
        for tag in item['tags']:
            if tag not in ignored_tags:
                tags_corpus_index.setdefault(tag, set()).add(item['id'])

    # Collect vertex-index pairs in a set so that articles sharing several
    # tags still get exactly one edge.
    combinations = set()
    for items_idx in tags_corpus_index.values():
        # A tag used by a single article cannot link anything.
        if len(items_idx) < 2:
            continue
        sources = g.vs.select(id_in=items_idx)
        combinations.update(
            itertools.combinations([source.index for source in sources], 2))

    g.add_edges(combinations)
    print(g.vcount())
    print(g.ecount())
    # Guard the debug print: with no edges there is no "last edge" to show.
    if g.ecount():
        print(g.es[g.ecount() - 1])
    return g
def print_corpus(template: str) -> None:
    """Print every corpus item rendered through ``template``.

    Args:
        template: A ``str.format`` template; each item yielded by
            ``read_corpus()`` is expanded into it as keyword arguments.
    """
    # Plain loop: the calls are made for their side effect only, so there
    # is no reason to build a list of ``None`` results.
    for out in read_corpus():
        print(template.format(**out))
from urllib.parse import urlunsplit, urlsplit, parse_qs

import tornado.ioloop
import tornado.web
import tornado.httpserver

from . import ui_modules
from .handlers.article_tags_graph import ArticleTagsGraphHandler
from .handlers.base import BaseHandler
from .handlers.twin_tags_graph import TwinTagsGraphHandler
from graph_libs.article_tags_graph import read_graph
from graph_libs.corpus import read_corpus, read_ignore_tags
from graph_libs.twin_tags_graph import read_twin_tags_index

# Module-level state, built once when this module is imported:
# an id -> item lookup over the corpus and the twin-tags index.
corpus = {entry['id']: entry for entry in read_corpus()}
twin_tags_index = read_twin_tags_index()

# Application settings mapping.
settings = dict(ui_modules=ui_modules)


class IndexRedirectHandler(BaseHandler):
    """Redirect the request to ``/article-tags-graph/`` on the same host."""

    def get(self):
        """Send a redirect to the article-tags-graph page.

        The target keeps the scheme and network location of the incoming
        request URL and drops its query string and fragment.  The same URL
        is also exposed in the ``X-VF-Staging-Redirect`` response header.
        """
        request_parts = urlsplit(self.request.full_url())
        target_parts = (
            request_parts.scheme,
            request_parts.netloc,
            '/article-tags-graph/',
            '',
            '',
        )
        target = urlunsplit(target_parts)
        self.set_header('X-VF-Staging-Redirect', target)
        self.redirect(target)
def corpus(self):
    """Return the corpus as a list, loading it on first access.

    The list is read from ``read_corpus()`` only while the cached value is
    still ``None``; later calls return the stored list.
    """
    if self.__corpus is not None:
        return self.__corpus

    # First access: materialise the corpus and remember it.
    self.__corpus = list(read_corpus())
    return self.__corpus