Example #1
from similarity.service import SimilarityService, ResourceSimilarityHandler
from similarity.service.similarity_handler import SimilarityHandler
from summary.service import SummaryHandler
from tagger.config import load, config
from tagger.service import DocumentsHandler, ContentStoreService, DocumentProcessor, DocumentHandler
from tornado.httpserver import HTTPServer
from tornado.ioloop import IOLoop
from tornado.web import Application
from trinity import Logger
from web import ConfigHandler
from web import StatusHandler
from web import VersionHandler

logger = Logger.get_logger("TrinityApp")


class TrinityApp(Application):
    def __init__(self, config_file):
        self.config_file = config_file
        load(config_file)

        content_store_service = ContentStoreService()
        handlers = [(
            '/tagger/document',
            DocumentHandler,
            {
                "content_store_service": content_store_service,
                "document_processor": DocumentProcessor(content_store_service)
            },
        ),
                    (
Example #2
import json
import os
from copy import deepcopy

import yaml
from trinity import Logger

logger = Logger.get_logger("Config")


class _Config:
    """
        TODO: Pramod - Refactor and simplify the process of making config a singleton
    """
    DEFAULT_CONFIG_FILE = os.path.join(os.path.dirname(__file__), "../config/sample.yml")
    _ = {}

    def load(self, config_file):
        # Use context managers so the config files are closed promptly.
        with open(self.DEFAULT_CONFIG_FILE, "r") as default_config:
            self._ = yaml.load(default_config)
        if config_file is not None and os.path.isfile(config_file):
            with open(config_file, "r") as override_config:
                overriden_configs = yaml.load(override_config)
            if not overriden_configs:
                logger.info("Empty overrides")
                return
            logger.info("Overriding Config Values")
            logger.info(json.dumps(overriden_configs, indent=4))
            self._ = merge_dictionaries(self._, overriden_configs)
        logger.info("Full config")
        logger.info(json.dumps(self._, indent=4))
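
The merge_dictionaries helper used above is not part of this snippet. Given the deepcopy import, it is presumably a recursive overlay of the override values onto the defaults; a minimal sketch, assuming that name and behaviour:

def merge_dictionaries(defaults, overrides):
    # Recursively overlay `overrides` on top of `defaults` without
    # mutating either input dictionary.
    merged = deepcopy(defaults)
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_dictionaries(merged[key], value)
        else:
            merged[key] = deepcopy(value)
    return merged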
Example #3
import os
import threading
from collections import defaultdict

import cPickle
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from lockfile import FileLock
from trinity import Logger

logger = Logger.get_logger("LDATagger")


class LDATagger:
    _lda_model = None
    _dictionary = None
    _lda_model_path = None
    _dictionary_path = None
    DEFAULT_MODEL_PATH = os.path.join(os.path.dirname(__file__), "model")
    DEFAULT_NUM_TOPICS = 1000

    def __init__(self,
                 model_path=DEFAULT_MODEL_PATH,
                 num_topics=DEFAULT_NUM_TOPICS,
                 lock=threading.Lock()):
        self.save_model_lock = lock

        if os.path.isfile(model_path):
            raise Exception("Invalid Model Path; Should Be a Directory")
        if not os.path.exists(model_path):
            os.makedirs(model_path)
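
The snippet cuts off before the persistence code, but the cPickle and FileLock imports and the save_model_lock attribute hint at how the model is stored. A minimal sketch of such a save method, assuming the name save_model and that _lda_model_path and _dictionary_path are assigned in the elided part of __init__:

    def save_model(self):
        # Guard with both the thread lock and a file lock so concurrent
        # requests and processes cannot corrupt the pickled files.
        with self.save_model_lock:
            with FileLock(self._lda_model_path):
                with open(self._lda_model_path, "wb") as model_file:
                    cPickle.dump(self._lda_model, model_file)
                with open(self._dictionary_path, "wb") as dictionary_file:
                    cPickle.dump(self._dictionary, dictionary_file)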
Example #4
from math import log

from stats import distance_deviation
from trinity import Logger

logger = Logger.get_logger("Summarizer")


class Summarizer:
    ALPHA = 1
    BETA = 1
    GAMMA = 2
    DEFAULT_COMPRESSION = 10
    NUMBER_OF_WEIGHTING_MEASURES = 3.0

    def __init__(self, compression_ratio=DEFAULT_COMPRESSION):
        self.compression_ratio = compression_ratio

    def summary_length(self, processed_document):
        compressed_length_based_on_document = processed_document.number_of_sentences() * self.compression_ratio / 100
        if compressed_length_based_on_document == 0:
            return processed_document.number_of_sentences()
        return compressed_length_based_on_document

    def summarize_using_weighing_measures(self, processed_document):
        number_of_sentences_in_summary = self.summary_length(processed_document)
        summary_sentence_ids = []
        for i in range(0, number_of_sentences_in_summary):
            highest_weighed_sentence_number = self.highest_weighed_sentence_id(processed_document)
            summary_sentence_ids.append(highest_weighed_sentence_number)
            processed_document.eliminate_processed_sentence(highest_weighed_sentence_number)
        summary = [processed_document.sentence(sentence_number) for sentence_number in sorted(summary_sentence_ids)]
Example #5
from collections import defaultdict

from stats import jaccard_coefficient, CosineSimilarity
from summary.core import TokensSpace
from text import TextProcessor
from trinity import Logger

logger = Logger.get_logger("ResourceSimilarityService")


def similarity_co_efficient((value1, value2), text_processor=TextProcessor()):
    a_text_value = value1["value"]
    other_text_value = value2["value"]
    # Only plain text (str/unicode) or token lists can be compared.
    if not isinstance(a_text_value, (str, unicode, list)) or \
            not isinstance(other_text_value, (str, unicode, list)):
        return 0
    if not isinstance(a_text_value, list):
        a_text_value = text_processor.tokenize(value1["value"])
    if not isinstance(other_text_value, list):
        other_text_value = text_processor.tokenize(value2["value"])

    return _co_efficient_calculation_mechanism(a_text_value)(a_text_value,
                                                             other_text_value)


def _co_efficient_calculation_mechanism(field_value,
                                        TOKENS_LENGTH_THRESHOLD=10):
    # Short token lists are compared with Jaccard; longer ones with cosine.
    if len(field_value) < TOKENS_LENGTH_THRESHOLD:
        return jaccard_coefficient
    return cosine_similarity
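
Note that the import above provides the CosineSimilarity class while the selector returns a cosine_similarity function; the module presumably defines an adapter elsewhere. A hypothetical sketch, assuming CosineSimilarity exposes a calculate method (that interface is not shown in the snippet):

def cosine_similarity(tokens, other_tokens):
    # Hypothetical adapter around the imported CosineSimilarity class;
    # its actual interface is an assumption.
    return CosineSimilarity().calculate(tokens, other_tokens)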
Example #6
import json

import httplib
import requests
from tagger.config import config
from tagger.service.contracts import DocumentsResponse, DocumentTopicsMixturesRequest, DocumentsTagsRequest, \
    DocumentResponse
from tagger.service.contracts.document_topics_mixture_requests import DocumentTopicsMixtureRequest
from tagger.service.contracts.documents_tags_request import DocumentTagsRequest
from trinity import Logger

logger = Logger.get_logger("ContentStoreService")


class ContentStoreService:
    HEADERS = {'Content-Type': 'application/json'}
    CONNECTION_ERROR = "Connection Error While Fetching"
    STATUS_FAILED = "Failed. Status Not Ok"

    def fetch_documents(self, documents_request):
        get_docs_url = config("content_store.host") + config(
            "content_store.get_docs")
        response = None
        try:
            logger.info("Fetching %s Documents to Tag from %s" %
                        (documents_request, get_docs_url))
            response = requests.get(get_docs_url,
                                    data=documents_request,
                                    headers=self.HEADERS)
            # Equality, not identity: "is" on ints is unreliable here.
            if response.status_code != httplib.OK:
                logger.error(response.text)
Example #7
import json

import requests
from similarity.service.resource_similarity_service import all_resources_similarity
from tornado.web import RequestHandler
from trinity import Logger
from trinity.contracts import Response
from util import Unblock

logger = Logger.get_logger("ResourceSimilarityHandler")


class ResourceSimilarityHandler(RequestHandler):
    def initialize(self, similarity_threshold):
        self.similarity_threshold = similarity_threshold

    @Unblock()
    def post(self):
        """
            Accepts a request with json as below:
            {
                "resources" : [resource1, resource2, ...],
                "callback_url": "callback url"
            }

            resource = {
                "id": sha123,
                "field1" : {
                    "value": bla,
                    "weight" : optional
                },
Example #8
import json
import os

import httplib
from tagger.config import config
from tagger.core import LDATagger
from tornado.web import RequestHandler
from trinity import Logger
from trinity.contracts import Response

logger = Logger.get_logger("DocumentsHandler")


class DocumentHandler(RequestHandler):
    def initialize(self, document_processor, content_store_service):
        self.document_processor = document_processor
        self.content_store_service = content_store_service

    def post(self):
        document_id = json.loads(self.request.body)["documentId"]
        logger.info("Request to infer topics for document %s received" %
                    document_id)

        try:
            document_response = self.content_store_service.fetch_document(
                document_id)
        except Exception as e:
            document_fetch_error = "unable to fetch Document for Tagging"
            logger.info(e)
            logger.error("%s for Id %s" % (document_fetch_error, document_id))
Example #9
from concurrent.futures import ThreadPoolExecutor
from trinity import Logger


class Unblock:
    def __init__(self):
        self.logger = Logger.get_logger("Unblock")
        self.executor = ThreadPoolExecutor(max_workers=100)
        self.logger.info("Created a ThreadPool")
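
Elsewhere in these examples Unblock is applied as a method decorator (@Unblock()), so the class presumably also defines __call__ to hand the wrapped handler body to the pool. A rough sketch of what that could look like; the wrapping details are assumptions, not shown in the snippet:

    def __call__(self, func):
        # Assumed decorator hook: run the handler body on the thread pool
        # instead of blocking the Tornado IOLoop.
        def unblocked(handler, *args, **kwargs):
            self.executor.submit(func, handler, *args, **kwargs)
        return unblocked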
Example #10
import json
import os
import os.path as os_path
import shutil
import unittest

from tagger.config import load, config
from tagger.core import LDATagger
from tagger.service.contracts import DocumentsResponse
from tests.web import StubHTTPServer
from tornado.testing import AsyncHTTPTestCase
from trinity import Logger
from web import TrinityApp

logger = Logger.get_logger("InferTaggingIntegrationTest")


class InferTaggingIntegrationTest(AsyncHTTPTestCase):
    """
            Assumes that there is and exiting model which has representation of generated topics.
            Expressed by creating a model in the setUp phase
    """

    config_path = os_path.join(os_path.abspath(os_path.dirname(__file__)),
                               "config.yml")
    stub_http_server = None
    document_response = {
        "id":
        "10",
        "tokens": [
            "content", "network", "router", "wifi", "cable", "ethernet",
Example #11
import json

import requests
from summary.core import Document
from summary.core import Summarizer
from trinity import Logger
from util import Unblock, Post
from util.tornado.web import MethodDispatcher

logger = Logger.get_logger("SummaryHandler")


class SummaryHandler(MethodDispatcher):
    @Unblock()
    def put(self):
        request_body = self.request.body
        logger.debug("Request received for summarisation; Request body: %s" %
                     request_body[0:20])
        parameters = json.loads(request_body)

        callback_url = parameters["callback"]
        document_id = parameters["documentId"]
        extracted_text = parameters["extractedText"]
        compression_ratio = int(parameters["compressionRatio"])
        summarizer = Summarizer(compression_ratio=compression_ratio)
        document = Document(doc_id=document_id,
                            text=extracted_text,
                            summarizer=summarizer)

        try:
            logger.info("Generating Summary for document %s" % document_id)
Example #12
from tagger.config import config
from tagger.core import LDATagger, TagGenerator
from trinity import Logger
from trinity.contracts import Response

logger = Logger.get_logger("DocumentProcessor")


class DocumentProcessor:
    def __init__(self, content_store_service):
        self.content_store_service = content_store_service

    def error_response(self, error_message):
        logger.error(error_message)
        return Response(status="failure", message=error_message)

    def process(self, docs_tokens_map, tagger_model_path):
        tagger = LDATagger(tagger_model_path,
                           num_topics=config("app.max_topics"))

        tagger.build_or_update_model(docs_tokens_map.values())
        docs_topics_map = tagger.topics_for_documents(docs_tokens_map)
        topics_tokens_map = tagger.topics_to_tokens()
        docs_tags_map = TagGenerator(
            topics_tokens_map).generate_documents_tag_map(
                documents_tokens_map=docs_tokens_map,
                documents_topics_map=docs_topics_map)

        try:
            self.content_store_service.post_documents_logical_topics_associations(
                docs_topics_map, topics_tokens_map)
Example #13
import json

import requests
from tornado.web import RequestHandler
from trinity import Logger
from trinity.contracts import Response
from util import Unblock

logger = Logger.get_logger("SimilarityHandler")


class SimilarityHandler(RequestHandler):
    def initialize(self, similarity_service):
        self.similarity_service = similarity_service

    @Unblock()
    def post(self):
        logger.info("Request Received to calculate similarity between posts")

        parameters = json.loads(self.request.body)
        HEADERS = {'Content-Type': 'application/json'}

        try:
            similarity_map = self.similarity_service.find_similarity_across(
                parameters["documents"])
        except Exception as e:
            logger.exception("Error while computing similarity!")
            requests.post(parameters["callback_url"],
                          data=Response(status="failure",
                                        message=e.message).to_json(),
                          headers=HEADERS)
Example #14
from unittest import TestCase

from similarity.service import SimilarityService
from trinity import Logger

logger = Logger.get_logger("TestSimilarityService")


class TestSimilarityService(TestCase):
    def test_shouldFindSimilarity(self):
        input_documents = {
            "sha1": {
                "title": "this is a title",
                "body": "this is my first blog, hence sucks",
                "tags": []
            },
            "sha2": {
                "title": "this is a another title blog",
                "body": "this is my second blog, which is much better",
                "tags": []
            },
            "sha3": {
                "title": "a very mature title blog post ",
                "body":
                "We are surrounded by objects in the real world. This is my better second blog",
                "tags": []
            }
        }

        similarity_service = SimilarityService(0.7)
        similarity_map = similarity_service.find_similarity_across(
Example #15
import json
import os
import os.path as os_path

from tests.web.integration_test_case import IntegrationTestCase
from trinity import Logger

logger = Logger.get_logger("TestSimilarityExperiment")


class TestSimilarityExperiment(IntegrationTestCase):
    ok_status_response = "{'status': 'ok'}"

    def wait(self, condition=None, timeout=None):
        # Ignore the passed arguments and force a 60s timeout.
        super(TestSimilarityExperiment, self).wait(condition=None, timeout=60)

    def test_should_find_smililarity_between_blogs(self):
        server = self.stub_http_server

        similarity_request = {
            "callback_url": "http://localhost:9001/pipeline/similarity",
            "documents": {}
        }

        blogs_dir = os_path.join(os_path.abspath(os_path.dirname(__file__)), "blogs")
        blogs = os.listdir(blogs_dir)
        blogs.remove(".gitignore")
        if len(blogs) == 0:
            return