import numpy as np
import pandas as pd

from src.utilities.objects import VadChunk
from src.utilities import sken_singleton, sken_logger, constants
from src.services import question_detection

logger = sken_logger.get_logger("snippet_service")


# def agent_customer_sequence(input_excel_file):
#     df = pd.read_excel(input_excel_file)
#     df.text_ = df.text.astype(str)
#     df['a_bin'] = 0
#     df['b_bin'] = 0
#     df.a_bin = df.speaker.apply(lambda x: 0 if x == 'Agent' else 1)
#     df.b_bin = df.speaker.apply(lambda x: 0 if x == 'Customer' else 1)
#     df['a_bin_cumsum'] = df.a_bin.cumsum()
#     df['b_bin_cumsum'] = df.b_bin.cumsum()
#     df = df.drop(['a_bin', 'b_bin'], axis=1)
#     df['a_bin'] = df.speaker.apply(lambda x: 1 if x == 'Agent' else 0)
#     df['b_bin'] = df.speaker.apply(lambda x: 1 if x == 'Customer' else 0)
#     df['a_con'] = df.a_bin_cumsum * df.a_bin
#     df['b_con'] = df.b_bin_cumsum * df.b_bin
#     df.drop(['a_bin_cumsum', 'b_bin_cumsum', 'a_bin', 'b_bin'], axis=1, inplace=True)
#     df['identifier'] = df.a_con + df.b_con
#     df['name_idnet'] = df.speaker + "_" + df.identifier.astype(str)
#     df.drop(['a_con', 'b_con'], axis=1, inplace=True)
#     df['text_'] = df['text'] + ". "
#     df1 = df[['name_idnet', 'text_']].groupby(['name_idnet'], as_index=False).sum()
#     df2 = df.drop_duplicates("name_idnet")[['speaker', 'name_idnet']]
#     df2 = df2.merge(df1, on='name_idnet')
#     df2 = df2.drop(["name_idnet"], axis=1)
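# Hedged sketch (not part of the repo): the commented-out agent_customer_sequence
# above collapses consecutive turns by the same speaker into one row via a cumsum
# trick. A minimal self-contained version of the same idea, assuming a DataFrame
# with 'speaker' and 'text' columns:
import pandas as pd

df = pd.DataFrame({
    "speaker": ["Agent", "Agent", "Customer", "Agent"],
    "text": ["Hello", "How can I help", "My order is late", "Let me check"],
})
# A new run starts whenever the speaker changes; cumsum numbers the runs.
run_id = (df.speaker != df.speaker.shift()).cumsum().rename("run")
merged = (df.groupby([run_id, "speaker"], sort=False)["text"]
          .apply(". ".join)
          .reset_index())
# merged now has one row per consecutive-speaker block.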
import os

import jsonpickle
from flask import Flask, request, Response, render_template, flash, redirect
from werkzeug.utils import secure_filename

from src.utilities import sken_logger, db, sken_singleton, constants
from src.services import dimension_engine
from src.services import facet_service

logger = sken_logger.get_logger("main")
sken_singleton.Singletons.get_instance()
db.DBUtils.get_instance()
tmp_pro_id = None  # used to detect when a new product request is made so the cache can be reset
request_count = 0
app = Flask(__name__)


@app.route('/')
def index():
    return render_template('index.html')


@app.route("/upload_file", methods=["POST", "GET"])
def upload_csv():
    global tmp_pro_id, request_count
    if request.method == "POST":
        if 'file' not in request.files:
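# Hedged usage sketch: with the app running locally, the upload endpoint would be
# exercised roughly like this. The field name 'file' comes from the check above;
# the host, port, and file name are illustrative.
import requests

with open("signals.xlsx", "rb") as fh:
    resp = requests.post("http://localhost:5000/upload_file", files={"file": fh})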
import time
import multiprocessing
from concurrent.futures import ThreadPoolExecutor

import spacy
import textacy
from google.cloud import translate

from src.utilities import constants, sken_logger, db

logger = sken_logger.get_logger("sentence_services")
nlp = spacy.load("en_core_web_sm")
client = translate.TranslationServiceClient()
parent = client.location_path(constants.fetch_constant("translate_project_id"), "global")
# fetch_constant returns a string elsewhere in the codebase, so cast before slicing
target_languages = [
    item.language_code
    for item in client.get_supported_languages(parent)
                       .languages[:int(constants.fetch_constant("translation_depth"))]
]


def paraphrase_sentence(text):
    global parent, target_languages

    def get_the_other(language):
        response = client.translate_text(
            parent=parent,
            contents=[text],
            mime_type='text/plain',  # mime types: text/plain, text/html
            source_language_code='en-IN',
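# Hedged sketch (assumes the Translation API client above is authenticated):
# paraphrase_sentence appears to round-trip text through another language and
# back to English ("back-translation"). The per-language step would look roughly
# like this; back_translate, lang, and text are illustrative names.
def back_translate(client, parent, text, lang):
    # English -> pivot language
    pivot = client.translate_text(parent=parent, contents=[text],
                                  mime_type='text/plain',
                                  source_language_code='en-IN',
                                  target_language_code=lang)
    # pivot language -> English; each result is a paraphrase candidate
    back = client.translate_text(parent=parent,
                                 contents=[t.translated_text for t in pivot.translations],
                                 mime_type='text/plain',
                                 source_language_code=lang,
                                 target_language_code='en-IN')
    return [t.translated_text for t in back.translations]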
import os

import pandas as pd
from google.cloud import translate

from src.utilities import db, constants, sken_logger

client = translate.TranslationServiceClient()
logger = sken_logger.get_logger("facet_signal")


def make_facet_signal_entries(file_path, org_id, prod_id):
    df = pd.read_excel(file_path)
    for facet_signal in range(len(df)):
        sql = """insert into facet_signal (value, facet_id, org_id, product_id)
                 values(%s, (select id from facet where name_ = %s), %s, %s)"""
        db.DBUtils.get_instance().execute_query(
            sql, (df.text[facet_signal], df.facet[facet_signal], org_id, prod_id),
            is_write=True,
            is_return=False)


def praphrase_sentences(
        text,
        depth=int(constants.fetch_constant("language_depth")),
        project_id=constants.fetch_constant("google_project_id")):
    parent = client.location_path(project_id, "global")
    supported = client.get_supported_languages(parent)
    target_languages = [item.language_code for item in supported.languages[:depth]]
    translated_text = []
    for language in target_languages:
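# Hedged usage sketch (file name and ids are illustrative): make_facet_signal_entries
# reads df.text and df.facet, so the Excel sheet needs one row per signal with
# 'text' and 'facet' columns, and each facet name must already exist in the facet
# table for the sub-select to resolve.
import pandas as pd

pd.DataFrame({
    "text": ["what is the pricing", "can you share a demo"],
    "facet": ["pricing", "demo"],
}).to_excel("facet_signals.xlsx", index=False)
# make_facet_signal_entries("facet_signals.xlsx", org_id=1, prod_id=42)  # needs a live DB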
from src.utilities import sken_logger, sken_singleton
from src.services import encoder

logger = sken_logger.get_logger("Sequence_match")


def interogation_detectore(sentence):
    verb_tags = ["VBZ", "VB", "VBD", "VBG", "VBN", "VBP"]
    doc = sken_singleton.Singletons.get_instance().get_nlp(sentence)
    matches = sken_singleton.Singletons.get_instance().get_phrase_matcher(doc)
    if len(matches) != 0:
        # Slice from the first matched interrogative opener onward
        first_filter = doc[matches[0][1]:]
        tag_list = [item.tag_ for item in first_filter]
        dep_list = [item.dep_ for item in first_filter]
        if "subj" in first_filter[0].dep_ or "obj" in first_filter[0].dep_:
            return {
                "question": str(first_filter),
                "subject": "will be in answer",
                "tag": "direct-question without subject"
            }
        try:
            flag = False
            for i in range(1, len(tag_list)):
                for j in range(i + 1, len(tag_list)):
                    if tag_list[i] in verb_tags and "nsubj" in dep_list[j]:
                        logger.debug("Subject candidate at index {}: {}".format(j, dep_list[j]))
                        flag = True
                        subject_index = j
                        break
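# Hedged sketch (standalone, not wired to the singletons above): the detector
# relies on a spaCy PhraseMatcher seeded with interrogative openers, then
# inspects POS tags and dependencies after the match. A minimal reproduction
# using the spaCy v3 matcher API; the opener list is illustrative.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("QUESTION_OPENER", [nlp.make_doc(t) for t in ["what", "when", "how", "can you"]])

doc = nlp("can you tell me when the demo starts")
matches = matcher(doc)
if matches:
    # Slice from the first matched opener onward, as interogation_detectore does
    span = doc[matches[0][1]:]
    print([(tok.text, tok.tag_, tok.dep_) for tok in span])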
from laserembeddings import Laser

from src.utilities import sken_logger, constants

logger = sken_logger.get_logger("sken_singleton")


class Singletons:
    __instance = None
    laser_embedder = None

    @staticmethod
    def get_instance():
        """Static access method"""
        if Singletons.__instance is None:
            logger.info("Calling private constructor for embedder initialization")
            Singletons()
        return Singletons.__instance

    def __init__(self):
        if Singletons.__instance is not None:
            raise Exception("The singleton is already initialized; do not initialize it again")
        else:
            logger.info("Initializing Laser embedder")
            self.laser_embedder = Laser()
            Singletons.__instance = self

    def perform_embeddings(self, all_sentences):
        """
        This method embeds all the sentences passed using the Laser embedder
        :param all_sentences:
        :return: list of sentence embeddings
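# Hedged usage sketch: perform_embeddings is truncated above, but the
# laserembeddings package it wraps exposes embed_sentences, which takes a
# language code and returns one 1024-dimensional vector per sentence.
from laserembeddings import Laser

embedder = Laser()
vectors = embedder.embed_sentences(
    ["pricing looks steep", "can we get a discount"], lang="en")
print(vectors.shape)  # (2, 1024)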
import json
import time
from concurrent.futures import ThreadPoolExecutor

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from src.utilities import sken_logger, sken_singleton
from src.services import sentence_services

logger = sken_logger.get_logger("match_result")


def make_result(signal, snippet, threshold):
    logger.info("Generating sentences for signal={}".format(signal))
    start = time.time()
    generated_signals = sentence_services.paraphrase_sentence(signal)
    logger.info("Extracting sentences out of {} signals".format(len(generated_signals) + 1))
    with ThreadPoolExecutor(max_workers=len(generated_signals) + 1) as exe:
        future = exe.map(sentence_services.get_extracted_sentences,
                         list(set(generated_signals + [signal])))
    extracted_sentences = []
    for item in future:
        extracted_sentences.extend(item['extracted_sentences'])
    paraphrased_signals = []
    # avoid shadowing the 'signal' parameter in the loops below
    for generated in set(generated_signals):
        paraphrased_signals.append({"signal_tag": "para", "signal": generated})
    extracted_signals = []
    for extracted in set(extracted_sentences):
        extracted_signals.append({"signal_tag": "extracted", "signal": extracted})
    logger.info("Final signal count={}".format(len(paraphrased_signals) + len(extracted_signals)))
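# Hedged sketch: given the imports above, the matching step presumably embeds
# the candidate signals and the snippet, then keeps pairs whose cosine
# similarity clears the threshold. A self-contained illustration with toy
# two-dimensional vectors; the 0.9 threshold is illustrative.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

signal_vecs = np.array([[0.9, 0.1], [0.2, 0.8]])  # embeddings of candidate signals
snippet_vec = np.array([[0.85, 0.15]])            # embedding of the snippet
scores = cosine_similarity(signal_vecs, snippet_vec).ravel()
matched = [i for i, s in enumerate(scores) if s >= 0.9]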
import re
import sys
from inspect import getframeinfo, stack

import psycopg2
from psycopg2 import pool
from psycopg2.extras import execute_values

from src.utilities import constants
from src.utilities import sken_logger

logger = sken_logger.get_logger("db")


class DBUtils:
    __instance = None
    sales_pool = None

    @staticmethod
    def get_instance():
        """ Static access method. """
        if DBUtils.__instance is None:
            logger.info("Calling private constructor for connection pool initialization")
            DBUtils()
        return DBUtils.__instance

    def __init__(self):
        if DBUtils.__instance is not None:
            raise Exception("This is a singleton class")
        else:
            logger.info(
                "Initializing connection pool for database connection; should happen only once during startup, with host={}".format(
                    constants.fetch_constant("host")))
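# Hedged sketch: the constructor is truncated above, but given the psycopg2
# pool import it presumably builds a connection pool from the constants. The
# connection parameters below are illustrative placeholders, not the repo's
# actual configuration.
from psycopg2 import pool

sales_pool = pool.SimpleConnectionPool(
    minconn=1, maxconn=10,
    host="localhost", port=5432,  # e.g. constants.fetch_constant("host"), ...
    dbname="sken", user="sken", password="secret")
conn = sales_pool.getconn()
try:
    with conn.cursor() as cur:
        cur.execute("select 1")
finally:
    sales_pool.putconn(conn)  # always return the connection to the pool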
import multiprocessing
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

import pandas as pd

from src.services import text_service, signal_service
from src.utilities import sken_singleton, sken_logger, db
from src.utilities.objects import VadChunk, Match

logger = sken_logger.get_logger("scoring_service")


def vad_chunk_match(vad_chunk, product_id):
    tokens = text_service.make_root_word(text_service.get_tokens(vad_chunk.text))
    if len(tokens) > 0:
        logger.info("Made {} tokens for snippet={}".format(len(tokens), vad_chunk.sid))
        signals = sken_singleton.Singletons.get_instance().get_cached_signals()[str(product_id)]

        def get_signal_scoring(signal):
            signal_df = signal.token_df
            threshold = signal.threshold
            matched = []
            matched_vals = []
            score = 0
            for tok in tokens:
                for i, val in enumerate(signal_df.val):
                    if val.isin([tok]).any() and tok not in matched_vals:
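# Hedged sketch: each cached signal carries a token_df whose 'val' column holds
# a pandas Series of acceptable root words per token slot (see
# signal_service.make_product_signal below). The inner check then reduces to a
# membership test per slot; the data here is illustrative.
import pandas as pd

signal_df = pd.DataFrame([
    {"val": pd.Series(["price", "cost"]), "score": 2},
    {"val": pd.Series(["discount"]), "score": 1},
])
tokens = ["cost", "shipping"]
for tok in tokens:
    for i, val in enumerate(signal_df.val):
        if val.isin([tok]).any():
            print("token {!r} matched slot {} (score {})".format(tok, i, signal_df.score[i]))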
import os
import pickle

import pandas as pd

from src.utilities import constants, db, sken_logger, sken_singleton, sken_exceptions
from src.utilities.objects import Signal
from src.services.text_service import make_root_word

logger = sken_logger.get_logger('signal_service')


def make_product_signal(signal_tokens, scores, threshold, value, product_id):
    signal_token_lists = []
    logger.info("Making signal_df")
    for token, score in zip(signal_tokens, scores):
        signal_token_lists.append({
            'val': pd.Series(make_root_word(signal_tokens[token])),
            'score': int(score)
        })
    df = pd.DataFrame(signal_token_lists)
    pickle_string = pickle.dumps(df)
    sql = "insert into public.product_signal (name, color, value, product_id, created_at, updated_at, is_active, " \
          "type, engine, match_type, do_generate) values(%s, '#f09600', %s, %s, now(), now(), true, '', " \
          "'RAZOR', 'BOTH', false) returning id;"
    rows, col_names = db.DBUtils.get_instance().execute_query(
        sql, (constants.fetch_constant("signal_name"), value, product_id),
        is_write=True,
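# Hedged sketch: the token DataFrame is pickled before being stored, so scoring
# can later rebuild it with a straight round-trip. Illustrative data only:
import pickle
import pandas as pd

df = pd.DataFrame([{"val": pd.Series(["price", "cost"]), "score": 2}])
blob = pickle.dumps(df)        # bytes suitable for a bytea column
restored = pickle.loads(blob)  # identical DataFrame back out
assert restored.score[0] == 2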
import nltk
import requests
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from textblob import TextBlob

from src.utilities import sken_logger, constants, sken_exceptions

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
logger = sken_logger.get_logger("text_service")
not_accepted_pos = ["DT", "VBZ", "PRP", "VBP", "MD", "VB", "IN"]


def get_tokens(sentence):
    """
    This method produces the tokens from a sentence
    :param sentence:
    :return: token list
    """
    return word_tokenize(sentence)


def get_synonyms(sentence):
    """
    This method breaks the sentence into tokens and gets the POS tags for them;
    if a tag is not in the restricted POS list, it gets the synonyms for each
    token using any of the three methods
    @param sentence:
    @return:
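# Hedged sketch (requires the nltk 'punkt' and 'averaged_perceptron_tagger'
# data packages): tokenize, tag, and drop tokens whose POS is in the restricted
# list, which is the filtering step get_synonyms describes.
import nltk
from nltk.tokenize import word_tokenize

not_accepted_pos = ["DT", "VBZ", "PRP", "VBP", "MD", "VB", "IN"]
tokens = word_tokenize("the customer wants a cheaper plan")
content_tokens = [tok for tok, tag in nltk.pos_tag(tokens) if tag not in not_accepted_pos]
# content_tokens -> ['customer', 'cheaper', 'plan']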
import os

import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher

from src.utilities import sken_logger

logger = sken_logger.get_logger("Singleton")


class Singletons:
    __instance = None
    tagger = None
    nlp = None
    sequence_idx = None
    phrase_matcher = None

    @staticmethod
    def get_instance():
        """ Static access method. """
        if Singletons.__instance is None:
            logger.info("Calling Singleton private constructor")
            Singletons()
        return Singletons.__instance

    def __init__(self):
        if Singletons.__instance is not None:
            raise Exception("This class is a singleton!")
        else:
            logger.info("Initializing token tagger")
import os
from concurrent.futures import ThreadPoolExecutor
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas

from src.utilities import sken_singleton, sken_logger, constants, sken_exceptions
from src.utilities.db import DBUtils
from src.utilities.objects import FacetSignal, Facet, CaughtFacetSignals
from src.services import snippet_service, facet_service

pool = ThreadPool(2)
logger = sken_logger.get_logger("dimension_engine")


def refresh_cached_dims(org_id, prod_id):
    """
    This method refreshes the cached_dimensions singleton; it clears the cached
    dimensions whenever a new product request is made
    :return:
    """
    with ThreadPoolExecutor(max_workers=2) as executor:
        # Submit the bound method itself: calling clear() inline would run it
        # eagerly on this thread and submit None to the executor. Note both
        # submissions clear the same LQ-dims cache.
        executor.submit(sken_singleton.Singletons.get_instance().get_cached_lq_dims().clear)
        executor.submit(sken_singleton.Singletons.get_instance().get_cached_lq_dims().clear)
    logger.info(
import re

import spacy
from textblob import TextBlob

from src.utilities import sken_logger, sken_singleton

logger = sken_logger.get_logger("encoder")


def return_clean_text(sentence):
    logger.info("Cleaning sentence: {}".format(sentence))
    return re.sub(r'[^a-zA-Z ]+', '', sentence).lower()


def sentence_breaker(sentence):
    """
    This method extracts all the sentences present in a single long sentence using TextBlob
    @param sentence: str
    @return: list of sentences present
    """
    if len(sentence.split()) > 0:
        testimonial = TextBlob(sentence)
        sentences = []
        for sent in testimonial.sentences:
            sentences.append(str(sent))
        return sentences
    return []  # empty input previously fell through and returned None


def get_tagged_sequence(sentence):
    clean_text = return_clean_text(sentence)
    tagger = sken_singleton.Singletons.get_instance().get_tagger()
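# Hedged usage sketch (needs the textblob corpora downloaded): sentence_breaker
# simply delegates sentence segmentation to TextBlob, like so.
from textblob import TextBlob

parts = [str(s) for s in TextBlob("Thanks for calling. How can I help?").sentences]
# parts -> ['Thanks for calling.', 'How can I help?']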