from similarity.service import SimilarityService, ResourceSimilarityHandler
from similarity.service.similarity_handler import SimilarityHandler
from summary.service import SummaryHandler
from tagger.config import load, config
from tagger.service import DocumentsHandler, ContentStoreService, DocumentProcessor, DocumentHandler
from tornado.httpserver import HTTPServer
from tornado.ioloop import IOLoop
from tornado.web import Application
from trinity import Logger
from web import ConfigHandler
from web import StatusHandler
from web import VersionHandler

logger = Logger.get_logger("TrinityApp")


class TrinityApp(Application):
    def __init__(self, config_file):
        self.config_file = config_file
        load(config_file)
        content_store_service = ContentStoreService()
        handlers = [(
            '/tagger/document',
            DocumentHandler,
            {
                "content_store_service": content_store_service,
                "document_processor": DocumentProcessor(content_store_service)
            },
        ), (
import json
import os
from copy import deepcopy

import yaml

from trinity import Logger

logger = Logger.get_logger("Config")


class _Config:
    """
    TODO: Pramod - Refactor and simplify the process of making config a singleton
    """
    DEFAULT_CONFIG_FILE = os.path.join(os.path.dirname(__file__), "../config/sample.yml")
    _ = {}

    def load(self, config_file):
        default_config = open(self.DEFAULT_CONFIG_FILE, "r")
        self._ = yaml.load(default_config)
        if config_file is not None and os.path.isfile(config_file):
            override_config = open(config_file, "r")
            overriden_configs = yaml.load(override_config)
            if not overriden_configs:
                logger.info("Empty overrides")
                return
            logger.info("Overriding Config Values")
            logger.info(json.dumps(overriden_configs, indent=4))
            self._ = merge_dictionaries(self._, overriden_configs)
        logger.info("Full config")
        logger.info(json.dumps(self._, indent=4))
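
# load() above calls a merge_dictionaries helper that is not part of this excerpt.
# A minimal sketch of such a recursive override merge, assuming the helper
# deep-merges nested dicts and lets override values win, might look like this:
def merge_dictionaries(base, overrides):
    """Return a new dict with values from overrides recursively merged into base."""
    merged = deepcopy(base)
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            # Recurse into nested sections so partial overrides keep sibling keys.
            merged[key] = merge_dictionaries(merged[key], value)
        else:
            merged[key] = value
    return merged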
import os
import threading
from collections import defaultdict

import cPickle
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from lockfile import FileLock

from trinity import Logger

logger = Logger.get_logger("LDATagger")


class LDATagger:
    _lda_model = None
    _dictionary = None
    _lda_model_path = None
    _dictionary_path = None
    DEFAULT_MODEL_PATH = os.path.join(os.path.dirname(__file__), "model")
    DEFAULT_NUM_TOPICS = 1000

    def __init__(self, model_path=DEFAULT_MODEL_PATH, num_topics=DEFAULT_NUM_TOPICS, lock=threading.Lock()):
        self.save_model_lock = lock
        if os.path.isfile(model_path):
            raise Exception("Invalid Model Path; Should Be a Directory")
        if not os.path.exists(model_path):
            os.makedirs(model_path)
from math import log

from stats import distance_deviation
from trinity import Logger

logger = Logger.get_logger("Summarizer")


class Summarizer:
    ALPHA = 1
    BETA = 1
    GAMMA = 2
    DEFAULT_COMPRESSION = 10
    NUMBER_OF_WEIGHTING_MEASURES = 3.0

    def __init__(self, compression_ratio=DEFAULT_COMPRESSION):
        self.compression_ratio = compression_ratio

    def summary_length(self, processed_document):
        compressed_length_based_on_document = processed_document.number_of_sentences() * self.compression_ratio / 100
        if compressed_length_based_on_document == 0:
            return processed_document.number_of_sentences()
        return compressed_length_based_on_document

    def summarize_using_weighing_measures(self, processed_document):
        number_of_sentences_in_summary = self.summary_length(processed_document)
        summary_sentence_ids = []
        for i in range(0, number_of_sentences_in_summary):
            highest_weighed_sentence_number = self.highest_weighed_sentence_id(processed_document)
            summary_sentence_ids.append(highest_weighed_sentence_number)
            processed_document.eliminate_processed_sentence(highest_weighed_sentence_number)
        summary = [processed_document.sentence(sentence_number) for sentence_number in sorted(summary_sentence_ids)]
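
# A quick worked example of summary_length above. The stub below is hypothetical
# and used only for illustration; the real ProcessedDocument interface is not
# shown in this excerpt. Under Python 2 integer division, a 42-sentence document
# at the default 10% compression gives 42 * 10 / 100 == 4 summary sentences,
# while a 7-sentence document yields 0 and so falls back to all 7 sentences.
class _FakeProcessedDocument(object):
    def __init__(self, count):
        self._count = count

    def number_of_sentences(self):
        return self._count


assert Summarizer().summary_length(_FakeProcessedDocument(42)) == 4
assert Summarizer().summary_length(_FakeProcessedDocument(7)) == 7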
from collections import defaultdict

from stats import jaccard_coefficient, CosineSimilarity
from summary.core import TokensSpace
from text import TextProcessor
from trinity import Logger

logger = Logger.get_logger("ResourceSimilarityService")


def similarity_co_efficient((value1, value2), text_processor=TextProcessor()):
    a_text_value = value1["value"]
    other_text_value = value2["value"]
    if not isinstance(a_text_value, (str, unicode, list)) or not isinstance(other_text_value, (str, unicode, list)):
        return 0
    if not isinstance(a_text_value, list):
        a_text_value = text_processor.tokenize(value1["value"])
    if not isinstance(other_text_value, list):
        other_text_value = text_processor.tokenize(value2["value"])
    return _co_efficient_calculation_mechanism(a_text_value)(a_text_value, other_text_value)


def _co_efficient_calculation_mechanism(field_value, TOKENS_LENGTH_THRESHOLD=10):
    return jaccard_coefficient if len(field_value) < TOKENS_LENGTH_THRESHOLD else cosine_similarity
import json
import httplib

import requests

from tagger.config import config
from tagger.service.contracts import DocumentsResponse, DocumentTopicsMixturesRequest, DocumentsTagsRequest, \
    DocumentResponse
from tagger.service.contracts.document_topics_mixture_requests import DocumentTopicsMixtureRequest
from tagger.service.contracts.documents_tags_request import DocumentTagsRequest
from trinity import Logger

logger = Logger.get_logger("ContentStoreService")


class ContentStoreService:
    HEADERS = {'Content-Type': 'application/json'}
    CONNECTION_ERROR = "Connection Error While Fetching"
    STATUS_FAILED = "Failed. Status Not Ok"

    def fetch_documents(self, documents_request):
        get_docs_url = config("content_store.host") + config("content_store.get_docs")
        response = None
        try:
            logger.info("Fetching %s Documents to Tag from %s" % (documents_request, get_docs_url))
            response = requests.get(get_docs_url, data=documents_request, headers=self.HEADERS)
            if response.status_code != httplib.OK:
                logger.error(response.text)
import json

import requests

from similarity.service.resource_similarity_service import all_resources_similarity
from tornado.web import RequestHandler
from trinity import Logger
from trinity.contracts import Response
from util import Unblock

logger = Logger.get_logger("ResourceSimilarityHandler")


class ResourceSimilarityHandler(RequestHandler):
    def initialize(self, similarity_threshold):
        self.similarity_threshold = similarity_threshold

    @Unblock()
    def post(self):
        """
        Accepts a request with json as below:
        {
            "resources" : [resource1, resource2, ...],
            "callback_url": "callback url"
        }
        resource = {
            "id": sha123,
            "field1" : {
                "value": bla,
                "weight" : optional
            },
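
# An illustrative payload for the handler above, following the docstring's
# contract. The field names, values, and weights here are invented for the
# example; the callback URL is the local stub endpoint used by the tests.
EXAMPLE_SIMILARITY_REQUEST = {
    "callback_url": "http://localhost:9001/pipeline/similarity",
    "resources": [
        {"id": "sha1", "title": {"value": "this is a title", "weight": 2}},
        {"id": "sha2", "title": {"value": "this is another title", "weight": 2}},
    ],
}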
import json
import os
import httplib

from tagger.config import config
from tagger.core import LDATagger
from tornado.web import RequestHandler
from trinity import Logger
from trinity.contracts import Response

logger = Logger.get_logger("DocumentsHandler")


class DocumentHandler(RequestHandler):
    def initialize(self, document_processor, content_store_service):
        self.document_processor = document_processor
        self.content_store_service = content_store_service

    def post(self):
        document_id = json.loads(self.request.body)["documentId"]
        logger.info("Request to infer topics for document %s received" % document_id)
        try:
            document_response = self.content_store_service.fetch_document(document_id)
        except Exception as e:
            document_fetch_error = "unable to fetch Document for Tagging"
            logger.info(e)
            logger.error("%s for Id %s" % (document_fetch_error, document_id))
def __init__(self):
    self.logger = Logger.get_logger("Unblock")
    self.executor = ThreadPoolExecutor(max_workers=100)
    self.logger.info("Created a ThreadPool")
import json
import os
import os.path as os_path
import shutil
import unittest

from tagger.config import load, config
from tagger.core import LDATagger
from tagger.service.contracts import DocumentsResponse
from tests.web import StubHTTPServer
from tornado.testing import AsyncHTTPTestCase
from trinity import Logger
from web import TrinityApp

logger = Logger.get_logger("InferTaggingIntegrationTest")


class InferTaggingIntegrationTest(AsyncHTTPTestCase):
    """
    Assumes that there is an existing model which has a representation of the generated topics.
    Expressed by creating a model in the setUp phase.
    """
    config_path = os_path.join(os_path.abspath(os_path.dirname(__file__)), "config.yml")
    stub_http_server = None
    document_response = {
        "id": "10",
        "tokens": [
            "content",
            "network",
            "router",
            "wifi",
            "cable",
            "ethernet",
import json

import requests

from summary.core import Document
from summary.core import Summarizer
from trinity import Logger
from util import Unblock, Post
from util.tornado.web import MethodDispatcher

logger = Logger.get_logger("SummaryHandler")


class SummaryHandler(MethodDispatcher):
    @Unblock()
    def put(self):
        request_body = self.request.body
        logger.debug("Request received for summarisation; Request body: %s" % request_body[0:20])
        parameters = json.loads(request_body)
        callback_url = parameters["callback"]
        document_id = parameters["documentId"]
        extracted_text = parameters["extractedText"]
        compression_ratio = int(parameters["compressionRatio"])
        summarizer = Summarizer(compression_ratio=compression_ratio)
        document = Document(doc_id=document_id, text=extracted_text, summarizer=summarizer)
        try:
            logger.info("Generating Summary for document %s" % document_id)
from tagger.config import config
from tagger.core import LDATagger, TagGenerator
from trinity import Logger
from trinity.contracts import Response

logger = Logger.get_logger("DocumentProcessor")


class DocumentProcessor:
    def __init__(self, content_store_service):
        self.content_store_service = content_store_service

    def error_response(self, error_message):
        logger.error(error_message)
        return Response(status="failure", message=error_message)

    def process(self, docs_tokens_map, tagger_model_path):
        tagger = LDATagger(tagger_model_path, num_topics=config("app.max_topics"))
        tagger.build_or_update_model(docs_tokens_map.values())
        docs_topics_map = tagger.topics_for_documents(docs_tokens_map)
        topics_tokens_map = tagger.topics_to_tokens()
        docs_tags_map = TagGenerator(topics_tokens_map).generate_documents_tag_map(
            documents_tokens_map=docs_tokens_map,
            documents_topics_map=docs_topics_map)
        try:
            self.content_store_service.post_documents_logical_topics_associations(
                docs_topics_map, topics_tokens_map)
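
# For illustration, the docs_tokens_map passed into process() above maps document
# ids to token lists, along the lines of the document fixture used in the
# integration tests. The single entry below is only a sketch of that shape.
EXAMPLE_DOCS_TOKENS_MAP = {
    "10": ["content", "network", "router", "wifi", "cable", "ethernet"],
}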
import json

import requests

from tornado.web import RequestHandler
from trinity import Logger
from trinity.contracts import Response
from util import Unblock

logger = Logger.get_logger("SimilarityHandler")


class SimilarityHandler(RequestHandler):
    def initialize(self, similarity_service):
        self.similarity_service = similarity_service

    @Unblock()
    def post(self):
        logger.info("Request Received to calculate similarity between posts")
        parameters = json.loads(self.request.body)
        HEADERS = {'Content-Type': 'application/json'}
        try:
            similarity_map = self.similarity_service.find_similarity_across(parameters["documents"])
        except Exception as e:
            logger.exception("Error while computing similarity!")
            requests.post(parameters["callback_url"],
                          data=Response(status="failure", message=e.message).to_json(),
                          headers=HEADERS)
from unittest import TestCase

from similarity.service import SimilarityService
from trinity import Logger

logger = Logger.get_logger("TestSimilarityService")


class TestSimilarityService(TestCase):
    def test_shouldFindSimilarity(self):
        input_documents = {
            "sha1": {
                "title": "this is a title",
                "body": "this is my first blog, hence sucks",
                "tags": []
            },
            "sha2": {
                "title": "this is a another title blog",
                "body": "this is my second blog, which is much better",
                "tags": []
            },
            "sha3": {
                "title": "a very mature title blog post ",
                "body": "We are surrounded by objects in the real world. This is my better second blog",
                "tags": []
            }
        }
        similarity_service = SimilarityService(0.7)
        similarity_map = similarity_service.find_similarity_across(
import json
import os
import os.path as os_path

from tests.web.integration_test_case import IntegrationTestCase
from trinity import Logger

logger = Logger.get_logger("TestSimilarityExperiment")


class TestSimilarityExperiment(IntegrationTestCase):
    ok_status_response = "{'status': 'ok'}"

    def wait(self, condition=None, timeout=None):
        super(TestSimilarityExperiment, self).wait(condition=None, timeout=60)

    def test_should_find_smililarity_between_blogs(self):
        server = self.stub_http_server
        similarity_request = {
            "callback_url": "http://localhost:9001/pipeline/similarity",
            "documents": {}
        }
        blogs_dir = os_path.join(os_path.abspath(os_path.dirname(__file__)), "blogs")
        blogs = os.listdir(blogs_dir)
        blogs.remove(".gitignore")
        if len(blogs) == 0:
            return