Esempio n. 1
0
def indexing(del_nodes_count=1, ignored_tags=None):
    """Build the co-occurring ("twin") tag-pair index and pickle it.

    A tag is kept when it is not ignored and occurs more than
    ``del_nodes_count`` times across the whole corpus.  For every unordered
    pair of kept tags that appear together on a corpus item, the index
    stores ``[count, ids]``: the number of items carrying both tags and the
    ``'id'`` of each such item.  Only pairs shared by at least two items are
    written to ``TWIN_TAGS_INDEX_FILE``.

    Pair keys are ``(tag1, tag2)`` tuples in sorted order, so a pair is
    found regardless of the order in which an item lists its tags.
    (Assumes tags are mutually comparable, e.g. all strings.)

    Args:
        del_nodes_count: Tags occurring this many times or fewer are dropped.
        ignored_tags: Container of tags to skip.  When falsy, the result of
            ``read_ignore_tags()`` is used instead.
    """
    from collections import Counter

    if not ignored_tags:
        ignored_tags = read_ignore_tags()

    # Read the corpus once so both passes below see the same items.
    corpus = list(read_corpus())

    # Count tag occurrences in a single pass.
    tag_counts = Counter(
        tag for item in corpus for tag in item['tags']
        if tag not in ignored_tags
    )

    # Keep only tags that occur often enough.
    kept_tags = {
        tag for tag, count in tag_counts.items() if count > del_nodes_count
    }

    # Number of candidate pairs, i.e. C(len(kept_tags), 2).
    print(len(kept_tags) * (len(kept_tags) - 1) // 2)

    # Count tag connections.  Sorting the de-duplicated tags of each item
    # gives every pair one canonical key and counts an item once per pair.
    pair_index = {}
    for item in corpus:
        item_tags = sorted(kept_tags.intersection(item['tags']))
        for pair in itertools.combinations(item_tags, 2):
            entry = pair_index.setdefault(pair, [0, []])
            entry[0] += 1
            entry[1].append(item['id'])

    # Drop pairs that co-occur on fewer than two items.
    formatted_tags = {
        pair: entry for pair, entry in pair_index.items() if entry[0] >= 2
    }

    with open(TWIN_TAGS_INDEX_FILE, 'wb') as out_file:
        pickle.dump(formatted_tags, out_file)
Esempio n. 2
0
def create_graph2():
    """Build a weighted, undirected graph of corpus items linked by tags.

    Vertices are corpus item ids (stored in the ``'id'`` vertex attribute).
    Two items are connected when they share at least one tag that is not in
    the ignore list; the edge weight is the number of such shared tags.

    Returns:
        The graph produced by ``igraph.Graph.TupleList``.
    """
    ignored_tags = read_ignore_tags()
    corpus = list(read_corpus())

    # Index {tag} => {item id, ...}, restricted to non-ignored tags.
    tags_corpus_index = {}
    for item in corpus:
        for tag in item['tags']:
            if tag not in ignored_tags:
                tags_corpus_index.setdefault(tag, set()).add(item['id'])

    # For every pair of items sharing a tag, count the shared tags.
    pair_weights = {}
    for item_ids in tags_corpus_index.values():
        # A tag carried by a single item cannot connect anything.
        if len(item_ids) < 2:
            continue
        # Sorting once yields every pair already in (low, high) order.
        for pair in itertools.combinations(sorted(item_ids), 2):
            pair_weights[pair] = pair_weights.get(pair, 0) + 1

    edges = [(*pair, weight) for pair, weight in pair_weights.items()]

    # Create graph
    g: Graph = igraph.Graph.TupleList(edges,
                                      vertex_name_attr='id',
                                      directed=False,
                                      weights=True)
    print(g.vcount())
    print(g.ecount())
    return g
Esempio n. 3
0
def create_graph():
    """Build an unweighted, undirected graph of corpus items linked by tags.

    One vertex is created per corpus item, in corpus order, with the item's
    id stored in the ``'id'`` vertex attribute.  Two vertices are connected
    by a single edge when their items share at least one tag that is not in
    the ignore list.

    Returns:
        The populated ``igraph.Graph``.
    """
    ignored_tags = read_ignore_tags()
    corpus = list(read_corpus())

    # Create graph
    g: Graph = igraph.Graph(directed=False)

    # Add one vertex per corpus item and label it with the item id.
    g.add_vertices(len(corpus))
    for vertex, item in zip(g.vs, corpus):
        vertex["id"] = item['id']

    # Index {tag} => {item id, ...}, restricted to non-ignored tags.
    tags_corpus_index = {}
    for item in corpus:
        for tag in item['tags']:
            if tag not in ignored_tags:
                tags_corpus_index.setdefault(tag, set()).add(item['id'])

    # Collect vertex-index pairs; a set keeps one edge per pair even when
    # two items share several tags.
    edges = set()
    for item_ids in tags_corpus_index.values():
        # A tag carried by a single item cannot connect anything.
        if len(item_ids) < 2:
            continue
        sources = g.vs.select(id_in=item_ids)
        edges.update(
            itertools.combinations([source.index for source in sources], 2))

    g.add_edges(edges)
    print(g.vcount())
    print(g.ecount())
    # Guard: with no edges there is no last edge to show.
    if g.ecount():
        print(g.es[g.ecount() - 1])
    return g
Esempio n. 4
0
def print_corpus(template: str) -> None:
    """Print every corpus item rendered through ``template``.

    Args:
        template: A ``str.format`` template; each item yielded by
            ``read_corpus()`` is expanded into it as keyword arguments.
    """
    for out in read_corpus():
        print(template.format(**out))
Esempio n. 5
0
from urllib.parse import urlunsplit, urlsplit, parse_qs

import tornado.ioloop
import tornado.web
import tornado.httpserver

from . import ui_modules
from .handlers.article_tags_graph import ArticleTagsGraphHandler
from .handlers.base import BaseHandler
from .handlers.twin_tags_graph import TwinTagsGraphHandler
from graph_libs.article_tags_graph import read_graph
from graph_libs.corpus import read_corpus, read_ignore_tags
from graph_libs.twin_tags_graph import read_twin_tags_index

# Corpus items keyed by their 'id'; built once, when this module is imported.
corpus = {item['id']: item for item in read_corpus()}
# Result of read_twin_tags_index(), also loaded once at import time.
twin_tags_index = read_twin_tags_index()

# Settings mapping exposing this package's ``ui_modules`` module.
# NOTE(review): presumably passed to the tornado Application -- confirm,
# the consumer is not visible here.
settings = {
    "ui_modules": ui_modules,
}


class IndexRedirectHandler(BaseHandler):
    """Redirect the request to ``/article-tags-graph/`` on the same host."""

    def get(self):
        """Redirect to the article-tags graph page.

        The target keeps the scheme and network location of the incoming
        request URL and drops any query string and fragment.  The target is
        also sent in the ``X-VF-Staging-Redirect`` header.
        """
        # Only scheme and netloc of the current URL are reused.
        scheme, netloc, *_ = urlsplit(self.request.full_url())
        target_parts = (scheme, netloc, '/article-tags-graph/', '', '')
        location = urlunsplit(target_parts)

        self.set_header('X-VF-Staging-Redirect', location)
        self.redirect(location)

Esempio n. 6
0
    def corpus(self):
        """Return the corpus as a list, reading it on first access only.

        The list built from ``read_corpus()`` is stored on the instance and
        returned as-is on later calls.
        """
        cached = self.__corpus
        if cached is not None:
            return cached

        cached = list(read_corpus())
        self.__corpus = cached
        return cached