Ejemplo n.º 1
    def __file_parser(self) -> None:
        """
        Iterates through the file object and creates mapping
        in the data.

        Every line of ``self.file_name`` is split on ``'|'`` and inspected
        for one of the record markers ``Applicant``, ``Mentor``,
        ``ChatGroup`` or ``Message``.

        An ``IOError`` raised while opening or reading the file is reported
        on stdout and through ``logger``; it is not re-raised.
        """
        # NOTE(review): the mappings built in ``d`` and the converted Message
        # fields are not stored anywhere in the visible code; the statements
        # that consumed them appear to be missing from this copy.
        try:
            # The context manager closes the handle even if parsing fails.
            with open(file=self.file_name, mode='r') as file_obj:
                for line in file_obj:
                    field = line.split('|')

                    # The marker is the last column, hence the trailing newline.
                    if "Applicant\n" in field:
                        d = {field[1]: None}
                    if "Mentor\n" in field:
                        d = {field[1]: field[2]}

                    if 'ChatGroup' in field:
                        field[3] = clean(field[3])
                        d = {int(field[1]): (field[2], field[3])}

                    if 'Message' in field:
                        field[4] = clean(field[4])
                        # The cleaned column is parsed as a POSIX timestamp.
                        field[4] = datetime.datetime.fromtimestamp(float(field[4]))
        except IOError:
            print("File not accessible")
            logger.error("File not Found")
Ejemplo n.º 2
def read_data():
    """Load the train/test CSVs and attach a cleaned comment column to each.

    Reads ``train.csv`` and ``test.csv`` from ``Config.data_path``, stores the
    result of ``clean`` applied to every ``comment_text`` value in a new
    ``comment_text_clean`` column of each frame, and finally replaces the
    missing values of the test frame with a single space.

    Returns:
        The ``(df_train, df_test)`` pair of frames.
    """
    train_path = Config.data_path + 'train.csv'
    test_path = Config.data_path + 'test.csv'
    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)

    for frame in (df_train, df_test):
        frame['comment_text_clean'] = frame['comment_text'].apply(
            lambda comment: clean(comment))

    # Only the test frame has its missing values filled in.
    df_test.fillna(' ', inplace=True)
    return df_train, df_test
Ejemplo n.º 3
    def costFunction(self, params):
        """Returns Error as a function of Lambda Params.

        Args:
            params: List of doubles, represents different lambda values,
                lambda_phi is the first param; lambda_pi's are the next
                6 parameters

        Returns:
            Cost - Integer representing number of misclassified results in the
            training set.
        """
        # Set Lambda Phi & Pi based upon params
        # 1st Param - Lambda Phi
        # Remaining Params - Lambda Pi, normalised so that they sum to 1
        self.lambda_phi = params[0]
        pi_params = params[1:]
        pi_total = sum(pi_params)  # computed once instead of once per element
        # A real list (not a lazy ``map`` object) so the weights can be indexed
        # and re-read on Python 3 as well as Python 2.
        self.lambda_pi = [x / pi_total for x in pi_params]

        # NOTE(review): no training call is visible at this point even though
        # the original comment announced "Train the classifier" -- confirm.

        # Calculate Error as
        # Number of Training Errors
        error = 0
        for (ques, ans) in self.training_set:

            res = self.get_classification(ques)
            pred_ans = ut.clean(ut.key_max_val_dict(res))

            if pred_ans != ans:
                error += 1

        return error
Ejemplo n.º 4
	def get_classification(self, text):
		"""Score every training answer against ``text`` with weighted n-gram counts.

		``text`` is cleaned with ``ut.clean`` and tokenised; for each
		``(ques, ans)`` pair of ``self.training_set`` the unigram, bigram and
		trigram ratios are summed, each weighted by one entry of
		``self.lambda_pi`` (indices 5..0).

		Returns a dict mapping ``self.training_orig.get(ans, ans)`` to the
		score of that pair.
		"""
		text = ut.clean(text)
		uni = nltk.tokenize.word_tokenize(text)
		# Materialise the n-grams: they are re-iterated for every training
		# pair, and NLTK 3 returns one-shot generators that would be empty
		# after the first pair.
		bi = list(nltk.bigrams(uni))
		tri = list(nltk.trigrams(uni))
		temp_lambda = self.lambda_pi
		# Map to store answer to its divergence pairs
		list_of_ans = dict()
		for (ques, ans) in self.training_set:
			fin_val = 0.0
			for t in uni:
				fin_val += temp_lambda[5] * (float(self.unigram_tot_dict.get(t,0))/self.len)
				fin_val += temp_lambda[4] * (float(self.unigram_dict.get((ques,t),0))/len(ques))
			for t in bi:
				# Missing denominators default to 1 so the ratio stays defined
				fin_val += temp_lambda[3] * (float(self.bigram_tot_dict.get(t,0))/self.unigram_tot_dict.get(t[:1],1))
				fin_val += temp_lambda[2] * (float(self.bigram_dict.get((ques,t),0))/self.unigram_dict.get((ques,t[:1]),1))
			for t in tri:
				fin_val += temp_lambda[1] * (float(self.trigram_tot_dict.get(t,0))/self.bigram_tot_dict.get(t[:2],1))
				fin_val += temp_lambda[0] * (float(self.trigram_dict.get((ques,t),0))/self.bigram_dict.get((ques,t[:2]),1))
			list_of_ans[self.training_orig.get(ans, ans)] = fin_val
		# Return Weighted list of responses
		return list_of_ans
Ejemplo n.º 5
	def classify(self,Xtest):
		# Creates an empty list, iterates over Xtest and returns the list.
		# NOTE(review): the body of the loop below is missing from this copy of
		# the source (the `for` header is followed directly by `return`), so the
		# snippet does not parse and the per-item logic cannot be documented.
		# Presumably one prediction per item was appended -- restore from the
		# original project before use.
		Prediction = list()
		for x in Xtest:
		return Prediction
Ejemplo n.º 6
    def __init__(self, lambda_phi, lambda_pi, training_set):
        """Store the lambda weights and a cleaned copy of the training data.

        Args:
            lambda_phi: Stored unchanged as ``self.lambda_phi``.
            lambda_pi: Stored unchanged as ``self.lambda_pi``.
            training_set: Iterable of ``(question, answer)`` pairs.
        """
        self.lambda_phi = lambda_phi
        self.lambda_pi = lambda_pi

        # Clean (Lower Case; Remove Punctuation) Training Set
        # Keep backup of formatted data as well
        self.training_orig = dict()
        self.training_set = []
        for (ques, ans) in training_set:
            clean_ques = ut.clean(ques)
            clean_ans = ut.clean(ans)  # cleaned once, used for both containers
            self.training_set.append((clean_ques, clean_ans))
            self.training_orig[clean_ans] = ans

        # Function-call form prints the same text on Python 2 and Python 3.
        print(self.training_set)

        # Initialize Maps for Phi and Pi, to learn their values
        self.pi_dict = dict()
        self.phi_dict = dict()
Ejemplo n.º 7
	def __init__ (self, lambda_phi, lambda_pi, training_set):
		"""Store the lambda weights and a cleaned copy of the training data.

		Args:
			lambda_phi: Stored unchanged as ``self.lambda_phi``.
			lambda_pi: Stored unchanged as ``self.lambda_pi``.
			training_set: Iterable of ``(question, answer)`` pairs.
		"""
		self.lambda_phi = lambda_phi
		self.lambda_pi = lambda_pi
		# Clean (Lower Case; Remove Punctuation) Training Set
		# Keep backup of formatted data as well
		self.training_orig = dict()
		self.training_set = []
		for (ques, ans) in training_set:
			clean_ques = ut.clean(ques)
			clean_ans = ut.clean(ans)
			# Without this append self.training_set stayed empty for good.
			self.training_set.append((clean_ques, clean_ans))
			self.training_orig[clean_ans] = ans
		# Function-call form prints the same text on Python 2 and Python 3.
		print(self.training_set)

		# Initialize Maps for Phi and Pi, to learn their values
		self.pi_dict = dict()
		self.phi_dict = dict()
Ejemplo n.º 8
    def classify(self, Xtest):
        # Creates an empty list, iterates over Xtest and returns the list.
        # NOTE(review): the body of the loop below is missing from this copy
        # of the source (the `for` header is followed directly by `return`),
        # so the snippet does not parse and the per-item logic cannot be
        # documented. Presumably one prediction per item was appended --
        # restore from the original project before use.

        Prediction = list()

        for x in Xtest:

        return Prediction
Ejemplo n.º 9
	def __init__ (self, path = "Data/Dataset1.csv"):
		"""Load the labelled rows from ``path`` and turn them into feature sets.

		Each CSV row is read as ``(row[1], row[0])``; the first element is
		cleaned and passed to ``self.feature_extractor``, the second is kept
		as the label. The result is stored in both ``self.data_set_total``
		and ``self.train_set``.

		Args:
			path: Location of the CSV file.
		"""
		# The context manager closes the file as soon as the rows are read.
		# NOTE(review): "rb" is the Python 2 csv convention; Python 3's csv
		# module needs a text-mode file opened with newline=''.
		with open(path,"rb") as csv_file:
			temp = [(row[1], row[0]) for row in csv.reader(csv_file)]
		# Must be loaded before feature_extractor runs, which reads self.tag_list.
		self.tag_list = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
		self.data_set_total = [(self.feature_extractor(ut.clean(n)), g) for (n,g) in temp]
		self.train_set = self.data_set_total
Ejemplo n.º 10
    def __init__(self, lambda_pi, training_set):
        """Keep the weights, clean the training pairs and reset all counters.

        ``training_set`` is iterated as ``(question, answer)`` pairs; each
        pair is stored cleaned in ``self.training_set`` while
        ``self.training_orig`` maps every cleaned answer back to its original
        text. The six n-gram dictionaries start empty and ``self.len`` at 0.
        """
        self.lambda_pi = lambda_pi

        # Cleaned pairs plus a cleaned-answer -> original-answer lookup.
        self.training_orig = {}
        self.training_set = []
        for question, answer in training_set:
            cleaned_pair = (ut.clean(question), ut.clean(answer))
            self.training_set.append(cleaned_pair)
            self.training_orig[ut.clean(answer)] = answer

        # Per-question n-gram tables.
        self.unigram_dict = {}
        self.bigram_dict = {}
        self.trigram_dict = {}
        # Tables over the whole training set.
        self.unigram_tot_dict = {}
        self.bigram_tot_dict = {}
        self.trigram_tot_dict = {}
        self.len = 0
Ejemplo n.º 11
    def get_classification(self, text):
        # Scores the answers of self.training_set against `text` and returns
        # a dict keyed by self.training_orig.get(answer, answer).
        # NOTE(review): this copy of the method is incomplete in two places
        # (flagged below) and does not parse as-is.
        text = ut.clean(text)

        # Map to store answer to its divergence pairs
        list_of_ans = dict()

        # Calculate Denominator of Cond. Prob. P(v|W); v - Single word in the answer; W - User utterance
        # Calculate once, since its independent of indv. responses/ answers
        pv_den = 0.0

        # Store PI_Ws(w_i) [i = 1...n] for each Ws for use in the numerator as well
        pi_prod = dict()

        # Iterate over all questions
        for (ques, ans) in self.training_set:
            for word in text.split():
                pi_prod[ques] = pi_prod.get(ques, 1) * self.pi_dict.get(
                    (ques, word), 1
                )  # Smoothing, if not available (i.e count = 0), make it 1 so that the rest doesnt become 0
                # NOTE(review): the body of this `if` is missing from the source.
                if (pi_prod[ques] == 0):
            pv_den += pi_prod[ques]

        # Calculate metric for answer that can be given
        # Iterate over all answers in training_Set
        for (question, answer) in self.training_set:

            # Iterate over individual words in every answer
            for word in answer.split():

                # Calculate numerator of Cond. Prob. P(v|W)
                pv_num = 0.0
                for ques, ans in self.training_set:
                    pv_num += self.phi_dict.get((ans, word), 0) * pi_prod[
                        ques]  # (ans, word) pairs absent from phi_dict contribute 0

                # Calculate pv for each word 'v'
                pv = pv_num / pv_den

                # In KL Divergence; log0 = 0
                # NOTE(review): the default below is 0 and is used as the
                # divisor, so an (answer, word) pair absent from phi_dict
                # would divide by zero -- confirm the intended smoothing.
                log_term = pv / self.phi_dict.get(
                    (answer, word), 0
                )
                if (log_term != 0):
                    log_term = np.log10(log_term)

                # NOTE(review): the first line of the next statement is
                # missing; the other copies of this method in the file start
                # it with `list_of_ans[self.training_orig.get(`.
                    answer, answer)] = list_of_ans.get(
                        self.training_orig.get(answer, answer),
                        0) + (pv * log_term)

        # Return Weighted list of responses
        return list_of_ans
Ejemplo n.º 12
    def __init__(self, path="Data/Dataset1.csv"):
        """Load the labelled rows from ``path`` and turn them into feature sets.

        Each CSV row is read as ``(row[1], row[0])``; the first element is
        cleaned and passed to ``self.feature_extractor``, the second is kept
        as the label. The result is stored in both ``self.data_set_total``
        and ``self.train_set``.

        Args:
            path: Location of the CSV file.
        """
        # The context manager closes the file as soon as the rows are read.
        # NOTE(review): "rb" is the Python 2 csv convention; Python 3's csv
        # module needs a text-mode file opened with newline=''.
        with open(path, "rb") as csv_file:
            temp = [(row[1], row[0]) for row in csv.reader(csv_file)]

        # The argument of this call was cut off in this copy; the resource
        # name is the one used by the tab-indented copy of this constructor.
        # Must be loaded before feature_extractor runs, which reads it.
        self.tag_list = nltk.data.load(
            'taggers/maxent_treebank_pos_tagger/english.pickle')

        self.data_set_total = [(self.feature_extractor(ut.clean(n)), g)
                               for (n, g) in temp]
        self.train_set = self.data_set_total
Ejemplo n.º 13
	def __init__ (self,lambda_pi, training_set):
		"""Keep the weights, clean the training pairs and reset all counters.

		Args:
			lambda_pi: Stored unchanged as ``self.lambda_pi``.
			training_set: Iterable of ``(question, answer)`` pairs.
		"""
		self.lambda_pi = lambda_pi
		# Clean (Lower Case; Remove Punctuation) Training Set
		# Keep backup of formatted data as well
		self.training_orig = dict()
		self.training_set = []
		for (ques, ans) in training_set:
			clean_ques = ut.clean(ques)
			clean_ans = ut.clean(ans)
			# Without this append self.training_set stayed empty for good.
			self.training_set.append((clean_ques, clean_ans))
			self.training_orig[clean_ans] = ans

		# N-gram tables start empty; self.len starts at 0.
		self.unigram_dict = dict()
		self.bigram_dict = dict()
		self.trigram_dict = dict()
		self.unigram_tot_dict = dict()
		self.bigram_tot_dict = dict()
		self.trigram_tot_dict = dict()
		self.len = 0
def data_prep(df_train, df_test, fast_text_embeddings):
    # Cleans the comment text of both frames, tokenises and pads it, and
    # builds an embedding matrix from the vectors file `fast_text_embeddings`.
    # Returns (train_features, train_labels, test_features, embedding_matrix).
    # NOTE(review): this copy does not parse -- both pad_sequences calls below
    # have lost their closing arguments.
    def extract_coefs(word, *arr):
        # One embeddings line -> (word, float32 vector)
        return word, np.asarray(arr, dtype='float32')

    # cleaning the comment text
    df_train['comment_text'] = df_train['comment_text'].apply(
        lambda x: clean(x))
    df_test['comment_text'] = df_test['comment_text'].apply(lambda x: clean(x))

    # Handling the nan values
    # NOTE(review): the fill happens after clean() has already been applied,
    # so clean() sees any missing values first -- confirm it tolerates them.
    train_features = df_train["comment_text"].fillna("fillna").values
    train_labels = df_train[Config.labels].values
    test_features = df_test["comment_text"].fillna("fillna").values

    # data preprocessing
    # The tokenizer vocabulary is fitted on train and test text together.
    tokenizer = text.Tokenizer(num_words=Config.max_features)
    tokenizer.fit_on_texts(list(train_features) + list(test_features))
    train_features = tokenizer.texts_to_sequences(train_features)
    test_features = tokenizer.texts_to_sequences(test_features)
    # NOTE(review): the continuation lines of the next two calls (remaining
    # arguments and closing parenthesis) are missing from the source.
    train_features = sequence.pad_sequences(train_features,
    test_features = sequence.pad_sequences(test_features,

    # creating the embedding matrix
    # NOTE(review): the embeddings file is opened without being closed.
    embeddings_index = dict(
        extract_coefs(*emb.rstrip().rsplit(' '))
        for emb in open(fast_text_embeddings))
    word_index = tokenizer.word_index
    nb_words = min(Config.max_features, len(word_index))
    embedding_matrix = np.zeros((nb_words, Config.embedding_size))

    # NOTE(review): rows are guarded by Config.max_features, not nb_words; if
    # the vocabulary is smaller than max_features and indices run up to
    # len(word_index), the largest idx would fall outside the matrix -- confirm.
    for word, idx in word_index.items():
        if idx >= Config.max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

    return train_features, train_labels, test_features, embedding_matrix
Ejemplo n.º 15
    def __init__(self, lambda_phi, lambda_pi, training_set):
        """Constructor; Initialize all class attributes.

        Args:
            lambda_phi: Lambda Values to indicate the relative importance of
                individual Answers vs The entire Set.
            lambda_pi: Lambda Values to indicate relative importance of indv.
                words vs bigram words vs trigram words in both individual
                questions and the entire set.
            training_set: Training data for Classification [Tuple Format].
        """
        self.lambda_phi = lambda_phi
        self.lambda_pi = lambda_pi

        # Clean (Lower Case; Remove Punctuation) Training Set
        # Keep backup of formatted data as well
        self.training_orig = dict()
        self.training_set = []
        for (ques, ans) in training_set:
            clean_ques = ut.clean(ques)
            clean_ans = ut.clean(ans)  # cleaned once, used for both containers
            self.training_set.append((clean_ques, clean_ans))
            self.training_orig[clean_ans] = ans

        # Initialize Maps for Phi and Pi, to learn their values
        self.pi_dict = dict()
        self.phi_dict = dict()
Ejemplo n.º 16
	def get_classification(self, text):
		# Scores the answers of self.training_set against `text` and returns
		# a dict keyed by self.training_orig.get(answer, answer).
		# NOTE(review): this copy does not parse as-is -- the body of the
		# `if(pi_prod[ques] == 0)` test below is missing.
		text = ut.clean(text)
		# Map to store answer to its divergence pairs
		list_of_ans = dict()

		# Calculate Denominator of Cond. Prob. P(v|W); v - Single word in the answer; W - User utterance
		# Calculate once, since its independent of indv. responses/ answers
		pv_den = 0.0

		# Store PI_Ws(w_i) [i = 1...n] for each Ws for use in the numerator as well
		pi_prod = dict()

		# Iterate over all questions
		for (ques, ans) in self.training_set:
			for word in text.split():
				pi_prod[ques] = pi_prod.get(ques , 1) * self.pi_dict.get((ques,word), 1) # Smoothing, if not available (i.e count = 0), make it 1 so that the rest doesnt become 0
				# NOTE(review): the body of this `if` is missing from the source.
				if(pi_prod[ques] == 0):
			pv_den += pi_prod[ques]
		# Calculate metric for answer that can be given
		# Iterate over all answers in training_Set
		for (question, answer) in self.training_set:
			# Iterate over individual words in every answer
			for word in answer.split():

				# Calculate numerator of Cond. Prob. P(v|W)
				pv_num = 0.0
				for ques,ans in self.training_set:
					pv_num += self.phi_dict.get((ans, word), 0) * pi_prod[ques]  # (ans, word) pairs absent from phi_dict contribute 0

				# Calculate pv for each word 'v'
				pv = pv_num / pv_den

				# In KL Divergence; log0 = 0
				# NOTE(review): the default below is 0 and is used as the
				# divisor, so an (answer, word) pair absent from phi_dict
				# would divide by zero -- confirm the intended smoothing.
				log_term = pv / self.phi_dict.get((answer,word), 0)
				if(log_term != 0):
					log_term = np.log10(log_term)

				# Accumulate pv * log_term per original answer text
				list_of_ans[self.training_orig.get(answer, answer)] = list_of_ans.get(self.training_orig.get(answer, answer),0) + (pv * log_term)
		# Return Weighted list of responses
		return list_of_ans
Ejemplo n.º 17
    def get_classification(self, text):
        """Score every training answer against ``text`` with weighted n-gram counts.

        ``text`` is cleaned with ``ut.clean`` and tokenised; for each
        ``(ques, ans)`` pair of ``self.training_set`` the unigram, bigram and
        trigram ratios are summed, each weighted by one entry of
        ``self.lambda_pi`` (indices 5..0).

        Returns a dict mapping ``self.training_orig.get(ans, ans)`` to the
        score of that pair.
        """
        text = ut.clean(text)

        uni = nltk.tokenize.word_tokenize(text)

        # Materialise the n-grams: they are re-iterated for every training
        # pair, and NLTK 3 returns one-shot generators that would be empty
        # after the first pair.
        bi = list(nltk.bigrams(uni))
        tri = list(nltk.trigrams(uni))

        temp_lambda = self.lambda_pi

        # Map to store answer to its divergence pairs
        list_of_ans = dict()

        for (ques, ans) in self.training_set:

            fin_val = 0.0

            for t in uni:
                fin_val += temp_lambda[5] * (
                    float(self.unigram_tot_dict.get(t, 0)) / self.len)
                fin_val += temp_lambda[4] * (
                    float(self.unigram_dict.get((ques, t), 0)) / len(ques))

            for t in bi:
                # Missing denominators default to 1 so the ratio stays defined
                fin_val += temp_lambda[3] * (
                    float(self.bigram_tot_dict.get(t, 0)) /
                    self.unigram_tot_dict.get(t[:1], 1))
                # Numerator restored from the tab-indented copy of this
                # method; its opening line was missing here.
                fin_val += temp_lambda[2] * (
                    float(self.bigram_dict.get((ques, t), 0)) /
                    self.unigram_dict.get((ques, t[:1]), 1))

            for t in tri:
                fin_val += temp_lambda[1] * (
                    float(self.trigram_tot_dict.get(t, 0)) /
                    self.bigram_tot_dict.get(t[:2], 1))
                # Numerator restored in the same way as for the bigrams.
                fin_val += temp_lambda[0] * (
                    float(self.trigram_dict.get((ques, t), 0)) /
                    self.bigram_dict.get((ques, t[:2]), 1))

            list_of_ans[self.training_orig.get(ans, ans)] = fin_val

        # Return Weighted list of responses
        return list_of_ans
Ejemplo n.º 18
	def costFunction (self, params):
		"""Return the number of misclassified training pairs for ``params``.

		Args:
			params: Sequence of lambda values; the first becomes
				``self.lambda_phi``, the rest are normalised to sum to 1 and
				stored as ``self.lambda_pi``.

		Returns:
			The error count as a float.
		"""
		self.lambda_phi = params[0]
		pi_params = params[1:]
		pi_total = sum(pi_params)  # computed once instead of once per element
		# A real list (not a lazy ``map`` object) so the weights can be indexed
		# and re-read on Python 3 as well as Python 2.
		self.lambda_pi = [x/pi_total for x in pi_params]

		# NOTE(review): no training call is visible at this point even though
		# the original comment announced "Train the classifier" -- confirm.

		# Calculate Error
		error = 0.0
		for (ques,ans) in self.training_set:
			res = self.get_classification(ques)
			pred_ans = ut.clean(ut.key_max_val_dict (res))
			# Add one for every misclassified result
			if(pred_ans != ans):
				error += 1

		return error
Ejemplo n.º 19
	def feature_extractor(self,sentence):
		"""Build the feature dict for one sentence.

		The dict holds "length" (number of tokens), "has(<q>)" for each of
		the five question words, and "count(<tag>)" for every label reported
		by ``self.tag_list.classifier().labels()``.
		"""
		features = {}
		tokens = word_tokenize(ut.clean(sentence))
		features["length"] = len(tokens)
		for question_word in ["how","where","when","what","why"]:
			features["has(%s)" % question_word] = question_word in tokens
		# Keep only the tag of every (token, tag) pair, then tally the tags.
		pos_tags = [pos for (_, pos) in pos_tag(tokens)]
		tag_counts = {}
		for pos in pos_tags:
			tag_counts[pos] = tag_counts.get(pos, 0) + 1
		# Labels that never occurred in the sentence get a count of 0.
		for label in self.tag_list.classifier().labels():
			features["count(%s)" % label] = tag_counts.get(label, 0)
		return features
Ejemplo n.º 20
    def costFunction(self, params):
        """Return the number of misclassified training pairs for ``params``.

        Args:
            params: Sequence of lambda values; they are normalised to sum to
                1 and stored as ``self.lambda_pi``.

        Returns:
            The error count as a float.
        """
        params_total = sum(params)  # computed once instead of once per element
        # A real list (not a lazy ``map`` object) so the weights can be indexed
        # and re-read on Python 3 as well as Python 2.
        self.lambda_pi = [x / params_total for x in params]

        # NOTE(review): no training call is visible at this point even though
        # the original comment announced "Train the classifier" -- confirm.

        # Calculate Error
        error = 0.0
        for (ques, ans) in self.training_set:
            res = self.get_classification(ques)
            pred_ans = ut.clean(ut.key_max_val_dict(res))
            # Add one for every misclassified result
            if (pred_ans != ans):
                error += 1

        return error
Ejemplo n.º 21
    def feature_extractor(self, sentence):
        """Build the feature dict for one sentence.

        The dict holds "length" (number of tokens), "has(<q>)" for each of
        the five question words, and "count(<tag>)" for every label reported
        by ``self.tag_list.classifier().labels()``.
        """
        features = {}

        tokens = word_tokenize(ut.clean(sentence))
        features["length"] = len(tokens)

        question_words = ["how", "where", "when", "what", "why"]
        for question_word in question_words:
            features["has(%s)" % question_word] = question_word in tokens

        # Tally the tag of every (token, tag) pair.
        tagged = pos_tag(tokens)
        tag_counts = {}
        for pair in tagged:
            _, pos = pair
            tag_counts[pos] = tag_counts.get(pos, 0) + 1

        # Labels that never occurred in the sentence get a count of 0.
        for label in self.tag_list.classifier().labels():
            features["count(%s)" % label] = tag_counts.get(label, 0)

        return features
def prep_dummy_csv():
    """Build an anonymised copy of the concatenated, cleaned CSV data.

    Billing e-mails are replaced by a short hash, name and address columns by
    the literal ``'Restricted'``, the ``Unnamed:_0`` column is dropped,
    missing shipping states become ``'Missing'``, missing discount amounts
    become the column mean, and rows with an ``Order_Date`` after
    ``'2020-12-31-23:59'`` are removed.
    """
    # NOTE(review): the resulting frame is neither returned nor written out in
    # the visible code -- the function may be truncated here.
    from hashlib import sha256

    # We need to hash customer emails as a semi-unique identifier
    def _short_hash(inp):
        # Feed the value into the digest; hashing nothing gave every customer
        # the same identifier.
        return sha256(str(inp).encode('utf-8')).hexdigest()[:6]

    to_dummy_csv = u.clean(u.concat('CSV_Files'))
    to_dummy_csv['Email_Billing'] = [
        _short_hash(email) for email in to_dummy_csv['Email_Billing']
    ]
    to_dummy_csv[['Full_Name_Billing', 'Full_Name_Shipping', 'Address_1_Shipping']] = 'Restricted'
    to_dummy_csv.drop('Unnamed:_0', axis=1, inplace=True)

    # Single .loc[rows, column] assignments write into the frame itself;
    # the chained frame[col].loc[rows] form may write to a temporary copy.
    missing_state = to_dummy_csv['State_Name_Shipping'].isna()
    to_dummy_csv.loc[missing_state, 'State_Name_Shipping'] = 'Missing'
    missing_discount = to_dummy_csv['Discount_Amount'].isna()
    to_dummy_csv.loc[missing_discount, 'Discount_Amount'] = to_dummy_csv['Discount_Amount'].mean()

    to_dummy_csv = to_dummy_csv.loc[to_dummy_csv['Order_Date'] <= '2020-12-31-23:59']
Ejemplo n.º 23
import trysearch as ts
import AlchemyAPI as AP

Dialog_Manager = DM.dialog_manager()
q_class = QC.q_classification()

# Interactive loop: read a question, turn it into a template string in which
# tagged words are replaced by 'action' / 'object', and pass the template to
# the dialog manager and the raw question to the question classifier.
# NOTE(review): raw_input is Python 2 only.
var = 1
while var:
	# Function-call form prints the same text on Python 2 and Python 3.
	print('\n')
	string = raw_input(" Enter the question: ")
	if string in ["end","End","exit","Exit"]:
		var = 0
	else:
		# The `else:` was missing in this copy: without it only the exit
		# words themselves were processed and real questions were ignored.
		temp = list(pos_tag(nltk.tokenize.word_tokenize(ut.clean(string))))
		temp1 = dict()
		temp2 = ''
		for (a,b) in temp:
			if a == 'i':
				# No-op assignment; the branch only keeps 'i' out of the
				# tag tests below.
				a = 'i'
			elif b == 'RB' or b == 'VB':
				temp2 += 'action '
				temp1['action'] = a
			elif b == 'VBP' or b == 'NN':
				temp2 += 'object '
				temp1['object'] = a
				# NOTE(review): this line may originally have belonged to a
				# final `else:` branch (copy untagged words) -- confirm.
				temp2 += a + ' '
		result_string = Dialog_Manager.get_reply(temp2)
		res_class = q_class.classify([string])
Ejemplo n.º 24
# Dispatch on the first command-line argument: each branch calls
# serve_frame('CSV_Files') and wraps the result in ma.ProductAnalysis.
# NOTE(review): the chain starts with `elif`; the leading `if` branch is not
# part of the visible source.
elif sys.argv[1] == 'pcp':
    frame = serve_frame('CSV_Files')
    dfpa = ma.ProductAnalysis(frame)
# NOTE(review): same test as the branch above, so this branch can never run;
# a different argument value was presumably intended.
elif sys.argv[1] == 'pcp':
    frame = serve_frame('CSV_Files')
    dfpa = ma.ProductAnalysis(frame)
elif sys.argv[1] == 'plg':
    frame = serve_frame('CSV_Files')
    dfpa = ma.ProductAnalysis(frame)

# Concatenate and clean the CSV data, then hand it to the field loader.
# The directory name is spelled 'CSV_Files' at every other call site in this
# file; the 'CSV_FIles' variant only resolves on case-insensitive file systems.
file = u.concat('CSV_Files')
# Test slice for functionality
file = u.clean(file)
cp1 = u.FieldLoader(file)
'''Work log:
Functionality and indexing for time: reading, saving and analyzing time fields (completed?)

Streamline initializations: maybe make an initialization function in Analysis so we can simply click on the .py file
and run the program.

**As of now, customer_profile line 63 should protect against repeated inputs of sales information. This will need to be tested.

***Concat in Utility needs to be corrected! At the moment there is only one customer!! Best guess is a recent addition--concat
'''
Ejemplo n.º 25
def serve_frame(file):
    """Return the frame produced by ``u.concat(file)`` after ``u.clean``."""
    return u.clean(u.concat(file))
Ejemplo n.º 26
    def get_classification(self, text):
        """Classify 'text' based upon learned values in 'train', and return weighted list of responses.

        Args:
            text: The text (or Question, in this case) to be classified.

        Returns:
            A list (as a dict) of the weighted responses for the given 'text',
            The key being the response, and the value being the KL-Divergence.
            For Example,

            ["This is the way"] => -0.0032,
            ["That is not true"] => -0.023,
        """
        # NOTE(review): this copy does not parse as-is -- the body of the
        # `if pi_prod[ques] == 0` test below is missing.

        # Clean the input (Remove Punctuation, Lower Case)
        text = ut.clean(text)

        # Map to store answer to its divergence pairs
        list_of_ans = dict()

        # Calculate Denominator of Cond. Prob. P(v|W); v - Single word in the answer; W - User utterance
        # Calculate once, since its independent of indv. responses/ answers
        pv_den = 0.0

        # Store PI_Ws(w_i) [i = 1...n] for each Ws for use in the numerator as well
        pi_prod = dict()

        # Iterate over all questions
        for (ques, ans) in self.training_set:
            for word in text.split():
                pi_prod[ques] = pi_prod.get(ques, 1) * self.pi_dict.get(
                    (ques, word), 1
                )  # Smoothing, if not available (i.e count = 0), make it 1 so that the rest doesnt become 0
                # NOTE(review): the body of this `if` is missing from the source.
                if pi_prod[ques] == 0:
            pv_den += pi_prod[ques]

        # Calculate metric for answer that can be given
        # Iterate over all answers in training_Set
        for (question, answer) in self.training_set:

            # Iterate over individual words in every answer
            for word in answer.split():

                # Calculate numerator of Cond. Prob. P(v|W)
                pv_num = 0.0
                for ques, ans in self.training_set:
                    pv_num += (
                        self.phi_dict.get((ans, word), 0) * pi_prod[ques]
                    )  # (ans, word) pairs absent from phi_dict contribute 0

                # Calculate pv for each word 'v'
                pv = pv_num / pv_den

                # By Definition of KL Divergence; log0 = 0
                # NOTE(review): the default below is 0 and is used as the
                # divisor, so an (answer, word) pair absent from phi_dict
                # would divide by zero -- confirm the intended smoothing.
                log_term = pv / self.phi_dict.get(
                    (answer, word), 0
                )
                if log_term != 0:
                    log_term = np.log10(log_term)

                # Accumulate pv * log_term per original answer text
                list_of_ans[self.training_orig.get(answer, answer)] = list_of_ans.get(
                    self.training_orig.get(answer, answer), 0
                ) + (pv * log_term)

        # Return Weighted list of responses
        return list_of_ans
Ejemplo n.º 27
Dialog_Manager = DM.dialog_manager()
q_class = QC.q_classification()

# Interactive loop: read a question, turn it into a template string in which
# tagged words are replaced by 'action' / 'object', and pass the template to
# the dialog manager.
# NOTE(review): raw_input is Python 2 only.
var = 1
while var:

    # Function-call form prints the same text on Python 2 and Python 3.
    print('\n')
    string = raw_input(" Enter the question: ")

    if string in ["end", "End", "exit", "Exit"]:
        var = 0
    else:
        # The `else:` was missing in this copy: without it only the exit
        # words themselves were processed and real questions were ignored.
        # The list literal had also lost its closing bracket.
        temp = list(
            pos_tag(nltk.tokenize.word_tokenize(ut.clean(string))))
        temp1 = dict()
        temp2 = ''
        for (a, b) in temp:
            if a == 'i':
                # No-op assignment; the branch only keeps 'i' out of the
                # tag tests below.
                a = 'i'
            elif b == 'RB' or b == 'VB':
                temp2 += 'action '
                temp1['action'] = a
            elif b == 'VBP' or b == 'NN':
                temp2 += 'object '
                temp1['object'] = a
                # NOTE(review): this line may originally have belonged to a
                # final `else:` branch (copy untagged words) -- confirm.
                temp2 += a + ' '
        result_string = Dialog_Manager.get_reply(temp2)