def __file_parser(self) -> None:
    """Iterate through the file object and create mappings in the data."""
    try:
        # A context manager avoids referencing an unbound file handle in a
        # ``finally`` block when the open itself fails.
        with open(file=self.file_name, mode='r') as file_obj:
            for line in file_obj:
                field = line.split('|')
                if "Applicant\n" in field:
                    self._user.update({field[1]: None})
                if "Mentor\n" in field:
                    self._user.update({field[1]: field[2]})
                if 'ChatGroup' in field:
                    field[3] = clean(field[3])
                    self._chat_groups.update({int(field[1]): (field[2], field[3])})
                if 'Message' in field:
                    field[4] = clean(field[4])
                    field[4] = datetime.datetime.fromtimestamp(float(field[4]))
                    self._messages.append(field)
    except IOError:
        print("File not accessible")
        logger.error("File not accessible: %s", self.file_name)

def read_data():
    df_train = pd.read_csv(Config.data_path + 'train.csv')
    df_test = pd.read_csv(Config.data_path + 'test.csv')
    df_train['comment_text_clean'] = df_train['comment_text'].apply(
        lambda x: clean(x))
    df_test['comment_text_clean'] = df_test['comment_text'].apply(
        lambda x: clean(x))
    df_test.fillna(' ', inplace=True)
    return df_train, df_test

def costFunction(self, params):
    """Returns the error as a function of the lambda params.

    Args:
        params: List of doubles representing the different lambda values;
            lambda_phi is the first param, the lambda_pi's are the next
            6 parameters.

    Returns:
        Cost - integer number of misclassified results in the training set.
    """
    # Set lambda_phi and lambda_pi from params:
    #   1st param        - lambda_phi
    #   remaining params - lambda_pi (normalized to sum to 1)
    self.lambda_phi = params[0]
    self.lambda_pi = map(lambda x: x / sum(params[1:]), params[1:])

    # Train the classifier
    self.train()

    # Calculate the error as the number of training misclassifications
    error = 0
    for (ques, ans) in self.training_set:
        res = self.get_classification(ques)
        pred_ans = ut.clean(ut.key_max_val_dict(res))
        if pred_ans != ans:
            error += 1
    return error

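# A sketch (not part of the original project) of how costFunction could be
# tuned: since the cost is an integer misclassification count, a simple
# derivative-free random search is used here. `clf` is an assumed instance
# of the classifier that defines costFunction above.
import numpy as np

def tune_lambdas(clf, n_trials=50):
    best_params, best_cost = None, float('inf')
    for _ in range(n_trials):
        # Draw 7 positive weights: lambda_phi followed by the 6 lambda_pi values
        params = list(np.random.dirichlet(np.ones(7)))
        cost = clf.costFunction(params)
        if cost < best_cost:
            best_params, best_cost = params, cost
    return best_params, best_cost
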
def get_classification(self, text):
    text = ut.clean(text)
    uni = nltk.tokenize.word_tokenize(text)
    # Materialize the n-grams so they can be iterated once per training question
    bi = list(nltk.bigrams(uni))
    tri = list(nltk.trigrams(uni))
    temp_lambda = self.lambda_pi

    # Map to store answer to its divergence pairs
    list_of_ans = dict()

    for (ques, ans) in self.training_set:
        fin_val = 0.0
        for t in uni:
            fin_val += temp_lambda[5] * (
                float(self.unigram_tot_dict.get(t, 0)) / self.len)
            fin_val += temp_lambda[4] * (
                float(self.unigram_dict.get((ques, t), 0)) / len(ques))
        for t in bi:
            fin_val += temp_lambda[3] * (
                float(self.bigram_tot_dict.get(t, 0)) /
                self.unigram_tot_dict.get(t[:1], 1))
            fin_val += temp_lambda[2] * (
                float(self.bigram_dict.get((ques, t), 0)) /
                self.unigram_dict.get((ques, t[:1]), 1))
        for t in tri:
            fin_val += temp_lambda[1] * (
                float(self.trigram_tot_dict.get(t, 0)) /
                self.bigram_tot_dict.get(t[:2], 1))
            fin_val += temp_lambda[0] * (
                float(self.trigram_dict.get((ques, t), 0)) /
                self.bigram_dict.get((ques, t[:2]), 1))
        list_of_ans[self.training_orig.get(ans, ans)] = fin_val

    # Return weighted list of responses
    return list_of_ans

def classify(self, Xtest):
    prediction = list()
    for x in Xtest:
        prediction.append(
            self.classifier.classify(self.feature_extractor(ut.clean(x))))
    return prediction

def __init__(self, path="Data/Dataset1.csv"):
    cr = csv.reader(open(path, "rb"))
    temp = [(row[1], row[0]) for row in cr]
    self.tag_list = nltk.data.load(
        'taggers/maxent_treebank_pos_tagger/english.pickle')
    self.data_set_total = [(self.feature_extractor(ut.clean(n)), g)
                           for (n, g) in temp]
    self.train_set = self.data_set_total
    self.train()

def __init__(self, lambda_pi, training_set):
    self.lambda_pi = lambda_pi

    # Clean (lower-case, remove punctuation) the training set and
    # keep a backup of the originally formatted answers as well
    self.training_orig = dict()
    self.training_set = []
    for (ques, ans) in training_set:
        self.training_set.append((ut.clean(ques), ut.clean(ans)))
        self.training_orig[ut.clean(ans)] = ans

    # N-gram count dictionaries, per question and over the entire set
    self.unigram_dict = dict()
    self.bigram_dict = dict()
    self.trigram_dict = dict()
    self.unigram_tot_dict = dict()
    self.bigram_tot_dict = dict()
    self.trigram_tot_dict = dict()
    self.len = 0
    self.train()

def data_prep(df_train, df_test, fast_text_embeddings):
    def extract_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Clean the comment text
    df_train['comment_text'] = df_train['comment_text'].apply(
        lambda x: clean(x))
    df_test['comment_text'] = df_test['comment_text'].apply(lambda x: clean(x))

    # Handle the NaN values
    train_features = df_train["comment_text"].fillna("fillna").values
    train_labels = df_train[Config.labels].values
    test_features = df_test["comment_text"].fillna("fillna").values

    # Data preprocessing: tokenize and pad to a fixed length
    tokenizer = text.Tokenizer(num_words=Config.max_features)
    tokenizer.fit_on_texts(list(train_features) + list(test_features))
    train_features = tokenizer.texts_to_sequences(train_features)
    test_features = tokenizer.texts_to_sequences(test_features)
    train_features = sequence.pad_sequences(train_features,
                                            maxlen=Config.max_sentence_len)
    test_features = sequence.pad_sequences(test_features,
                                           maxlen=Config.max_sentence_len)

    # Create the embedding matrix from the fastText vectors
    embeddings_index = dict(
        extract_coefs(*emb.rstrip().rsplit(' '))
        for emb in open(fast_text_embeddings))
    word_index = tokenizer.word_index
    nb_words = min(Config.max_features, len(word_index))
    embedding_matrix = np.zeros((nb_words, Config.embedding_size))
    for word, idx in word_index.items():
        # Guard against indices beyond the matrix (word_index starts at 1,
        # so idx can reach nb_words when the vocabulary is small)
        if idx >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
    return train_features, train_labels, test_features, embedding_matrix

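# read_data and data_prep above assume a Config object with the attributes
# used in those functions. This is a minimal sketch reconstructed from that
# usage; the path, label names, and numeric values are placeholders, not the
# project's real settings.
class Config:
    data_path = 'data/'              # directory containing train.csv and test.csv
    labels = ['label_a', 'label_b']  # target columns present in train.csv
    max_features = 100000            # tokenizer vocabulary size
    max_sentence_len = 150           # padded sequence length
    embedding_size = 300             # dimensionality of the fastText vectors
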
def __init__(self, lambda_phi, lambda_pi, training_set):
    """Constructor; initialize all class attributes.

    Args:
        lambda_phi: Lambda values indicating the relative importance of
            individual answers vs the entire set.
        lambda_pi: Lambda values indicating the relative importance of
            individual words vs bigrams vs trigrams, in both individual
            questions and the entire set.
        training_set: Training data for classification [tuple format].
    """
    self.lambda_phi = lambda_phi
    self.lambda_pi = lambda_pi

    # Clean (lower-case, remove punctuation) the training set and
    # keep a backup of the originally formatted answers as well
    self.training_orig = dict()
    self.training_set = []
    for (ques, ans) in training_set:
        self.training_set.append((ut.clean(ques), ut.clean(ans)))
        self.training_orig[ut.clean(ans)] = ans

    # Initialize maps for phi and pi, to learn their values
    self.pi_dict = dict()
    self.phi_dict = dict()

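# The `ut` utility module is not included in this collection. Based on the
# comment above ("Clean (Lower Case; Remove Punctuation)"), a minimal
# stand-in for ut.clean could look like the sketch below; the project's
# real implementation may differ.
import re

def clean(text):
    """Lower-case the text and strip punctuation characters (sketch only)."""
    return re.sub(r'[^\w\s]', '', text.lower())
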
def feature_extractor(self, sentence):
    features = {}
    words = word_tokenize(ut.clean(sentence))
    features["length"] = len(words)
    for q in ["how", "where", "when", "what", "why"]:
        features["has(%s)" % q] = (q in words)
    tag_dict = dict()
    for (_, tag) in pos_tag(words):
        tag_dict[tag] = tag_dict.get(tag, 0) + 1
    for tag in self.tag_list.classifier().labels():
        features["count(%s)" % tag] = tag_dict.get(tag, 0)
        # features["has(%s)" % tag] = (tag_dict.get(tag, 0) != 0)
    return features

def costFunction(self, params):
    # Normalize the lambda_pi weights so that they sum to 1
    self.lambda_pi = map(lambda x: x / sum(params), params)

    # Train the classifier
    self.train()

    # Calculate the error
    error = 0.0
    for (ques, ans) in self.training_set:
        res = self.get_classification(ques)
        pred_ans = ut.clean(ut.key_max_val_dict(res))
        # Add one for every misclassified result
        if pred_ans != ans:
            error += 1
    return error

def prep_dummy_csv():
    from hashlib import sha256

    # Hash customer emails as a semi-unique identifier
    # (renamed from `hash` to avoid shadowing the builtin)
    def hash_email(inp):
        h_e = sha256()
        h_e.update(inp.encode('utf-8'))
        return h_e.hexdigest()[:6]

    concat = u.concat('CSV_Files')
    to_dummy_csv = u.clean(concat)

    to_dummy_csv['Email_Billing'] = [hash_email(i) for i in to_dummy_csv['Email_Billing']]
    to_dummy_csv[['Full_Name_Billing', 'Full_Name_Shipping',
                  'Address_1_Shipping']] = 'Restricted'
    to_dummy_csv.drop('Unnamed:_0', axis=1, inplace=True)

    # Fill the missing values that can be imputed, then drop the rest
    to_dummy_csv.loc[to_dummy_csv['State_Name_Shipping'].isna(),
                     'State_Name_Shipping'] = 'Missing'
    to_dummy_csv.loc[to_dummy_csv['Discount_Amount'].isna(),
                     'Discount_Amount'] = to_dummy_csv['Discount_Amount'].mean()
    print(to_dummy_csv.isna().sum())
    to_dummy_csv.dropna(inplace=True)

    to_dummy_csv = to_dummy_csv.loc[to_dummy_csv['Order_Date'] <= '2020-12-31-23:59']
    print(to_dummy_csv.head(5))
    print(to_dummy_csv.columns)
    print(to_dummy_csv.tail(5))
    to_dummy_csv.to_csv('dummy.csv')

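# Illustrative check of the e-mail hashing approach above (a sketch, not
# part of the original script): the same address always maps to the same
# 6-character prefix of its SHA-256 hex digest.
from hashlib import sha256

def short_hash(email):
    return sha256(email.encode('utf-8')).hexdigest()[:6]

print(short_hash('alice@example.com') == short_hash('alice@example.com'))  # True
print(len(short_hash('alice@example.com')))                                # 6
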
import trysearch as ts
import AlchemyAPI as AP

Dialog_Manager = DM.dialog_manager()
q_class = QC.q_classification()

var = 1
while var:
    print '\n'
    string = raw_input(" Enter the question: ")
    if string in ["end", "End", "exit", "Exit"]:
        var = 0
    else:
        # POS-tag the cleaned question; adverbs/base verbs become 'action'
        # slots and present-tense verbs/nouns become 'object' slots for the
        # dialog manager, all other tokens are passed through unchanged
        temp = [(a, b)
                for (a, b) in pos_tag(nltk.tokenize.word_tokenize(ut.clean(string)))]
        temp1 = dict()
        temp2 = ''
        for (a, b) in temp:
            if a == 'i':
                pass  # drop the pronoun 'i' from the rewritten utterance
            elif b == 'RB' or b == 'VB':
                temp2 += 'action '
                temp1['action'] = a
            elif b == 'VBP' or b == 'NN':
                temp2 += 'object '
                temp1['object'] = a
            else:
                temp2 += a + ' '
        result_string = Dialog_Manager.get_reply(temp2)
        res_class = q_class.classify([string])

elif sys.argv[1] == 'pcp':
    frame = serve_frame('CSV_Files')
    dfpa = ma.ProductAnalysis(frame)
    dfpa.highest_positive_product_change_over_month_analysis()
# NOTE: the original repeated the 'pcp' condition here, which made this branch
# unreachable; 'pcn' is an assumed flag name for the negative-change report.
elif sys.argv[1] == 'pcn':
    frame = serve_frame('CSV_Files')
    dfpa = ma.ProductAnalysis(frame)
    dfpa.highest_negative_product_change_over_month_analysis()
elif sys.argv[1] == 'plg':
    frame = serve_frame('CSV_Files')
    dfpa = ma.ProductAnalysis(frame)
    dfpa.product_line_change_over_month_graph()

file = u.concat('CSV_Files')  # Test slice for functionality
file = u.clean(file)
cp1 = u.FieldLoader(file)
u.date_difference()
u.date_difference_email_list_compiler()
# profiles_master_init()

'''Work log: 2/25
functionality and indexing for time: reading, saving and analyzing time fields (completed?)
Streamline initializations: maybe make an initialization function in Analysis so we can simply click on the py file and run the program
**As of now customer_profile line 63 should protect from repeated inputs of sales information. This will need to be tested.
***Concat in Utility needs to be corrected! atm there is only one customer!! Best guess is recent addition--concat
def serve_frame(file):
    frame = u.concat(file)
    frame = u.clean(frame)
    return frame

def get_classification(self, text):
    """Classify 'text' based upon the values learned in 'train', and return
    a weighted list of responses.

    Args:
        text: The text (or question, in this case) to be classified.

    Returns:
        A dict of the weighted responses for the given 'text', the key being
        the response and the value being the KL divergence. For example:

            {
                "This is the way": -0.0032,
                "That is not true": -0.023,
                ...
            }
    """
    # Clean the input (remove punctuation, lower-case)
    text = ut.clean(text)

    # Map to store answer-to-divergence pairs
    list_of_ans = dict()

    # Denominator of the cond. prob. P(v|W); v - a single word in the answer,
    # W - the user utterance. Calculated once, since it is independent of the
    # individual responses/answers.
    pv_den = 0.0

    # Store PI_Ws(w_i) [i = 1...n] for each Ws, for use in the numerator as well
    pi_prod = dict()

    # Iterate over all questions
    for (ques, ans) in self.training_set:
        for word in text.split():
            # Smoothing: an unseen (question, word) pair defaults to 1 so the
            # product does not collapse to 0
            pi_prod[ques] = pi_prod.get(ques, 1) * self.pi_dict.get((ques, word), 1)
            if pi_prod[ques] == 0:
                break
        pv_den += pi_prod[ques]

    # Calculate the metric for each answer that can be given;
    # iterate over all answers in the training set
    for (question, answer) in self.training_set:
        # Iterate over individual words in every answer
        for word in answer.split():
            # Numerator of the cond. prob. P(v|W)
            pv_num = 0.0
            for ques, ans in self.training_set:
                pv_num += self.phi_dict.get((ans, word), 0) * pi_prod[ques]

            # Calculate pv for each word 'v'
            pv = pv_num / pv_den

            # By definition of KL divergence, log 0 = 0
            log_term = pv / self.phi_dict.get((answer, word), 0)
            if log_term != 0:
                log_term = np.log10(log_term)
            list_of_ans[self.training_orig.get(answer, answer)] = (
                list_of_ans.get(self.training_orig.get(answer, answer), 0)
                + (pv * log_term))

    # Return weighted list of responses
    return list_of_ans

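# Example of consuming the weighted responses returned above (a sketch;
# `kl_clf` is an assumed instance of this classifier and the question is
# purely illustrative). key_max_val_dict is the same helper costFunction
# uses to pick the highest-scoring response.
scores = kl_clf.get_classification("how do i reset my password")
best_response = ut.key_max_val_dict(scores)
print(best_response)
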