def load_utterance_dataset():
    """Load utterance threads, embed each sentence with word vectors,
    and split the flattened data into train/valid/test sets."""
    thread_data = gen_structured_data_from_utterances("../data/utterances.txt")
    print("load_utterance_dataset")
    sens = train_sen_flatten(thread_data)

    # Vectorize init
    vectorize = Vectorize()
    logging.info("training the sentence model")
    vectorize.train_google_model("../data/G.bin")
    logging.info("the google model done!")

    # apply the word vectors to the thread data
    for sen_dict in thread_data:
        # get the max sentence length
        len_of_sens = [len(v[1]) for k, v in sen_dict.items()]
        max_sen_len = max(len_of_sens)
        print(max_sen_len)
        for k, v in sen_dict.items():
            sen_vec = np.zeros((len(v[1]), 300))
            for i in range(len(v[1])):
                try:
                    sen_vec[i] = vectorize.google_model[v[1][i]]
                except KeyError:
                    # out-of-vocabulary word: leave its row as zeros
                    continue
            v[2] = sen_vec

    # gen the train_x, train_y
    # (earlier per-thread split, kept for reference)
    """
    thread_x, thread_y = gen_structured_xy(thread_data)
    # pdb.set_trace()
    logging.info("rnn start training")
    # prepare train_data and test_data
    data_x = [data for thread in thread_x for data in thread]
    data_y = [data for thread in thread_y for data in thread]
    TRAIN_SET = 200000
    VALID_SET = 210000
    print("data len: ", len(data_x))
    train_x = data_x[:TRAIN_SET]
    train_y = data_y[:TRAIN_SET]
    valid_x = data_x[TRAIN_SET:VALID_SET]
    valid_y = data_y[TRAIN_SET:VALID_SET]
    test_x = data_x[VALID_SET:]
    test_y = data_y[VALID_SET:]
    return train_x, train_y, valid_x, valid_y, test_x, test_y
    """

    data_x, data_y = gen_structured_xy(thread_data)
    TRAIN_SET = 1000
    VALID_SET = 1110
    train_x = data_x[:TRAIN_SET]
    train_y = data_y[:TRAIN_SET]
    valid_x = data_x[TRAIN_SET:VALID_SET]
    valid_y = data_y[TRAIN_SET:VALID_SET]
    test_x = data_x[VALID_SET:]
    test_y = data_y[VALID_SET:]
    return train_x, train_y, valid_x, valid_y, test_x, test_y
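# Usage sketch (hypothetical): assumes ../data/utterances.txt and the word2vec
# binary ../data/G.bin exist, and that gen_structured_data_from_utterances,
# train_sen_flatten, gen_structured_xy, Vectorize, np, and logging are
# available at module level as in the original project.
train_x, train_y, valid_x, valid_y, test_x, test_y = load_utterance_dataset()
print("train/valid/test sizes:", len(train_x), len(valid_x), len(test_x))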
def get_embedding_matrix_and_vectorizer(conversations):
    from vectorize import Vectorize
    vectorizer = Vectorize(conversations, MAX_VOCAB_SIZE)
    word_index = vectorizer.word2idx
    # train_sequences = vectorizer.vectorize_data(conversations, MAX_SEQUENCE_LENGTH)
    vectorizer.save_tokenizer(TOKENIZER_PATH)

    from embedding import Embedding
    embed = Embedding(word_index)
    embedding_matrix = embed.get_embedding_matrix()
    return embedding_matrix, vectorizer
def create_atari_env(env_id):
    env = gym.make(env_id)
    env = Vectorize(env)
    env = AtariRescale42x42(env)
    env = DiagnosticsInfo(env)
    env = Unvectorize(env)
    return env
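# Usage sketch: rolls one step through the wrapper chain above with the
# classic gym step API this Vectorize/Unvectorize pairing was written for;
# 'PongDeterministic-v4' is a hypothetical env id.
env = create_atari_env('PongDeterministic-v4')
obs = env.reset()                                  # 42x42 rescaled frame
obs, reward, done, info = env.step(env.action_space.sample())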
def check_embedding_quality(conversations):
    from vectorize import Vectorize
    vectorizer = Vectorize(conversations, MAX_VOCAB_SIZE)
    word_index = vectorizer.word2idx

    from embedding import Embedding
    embed = Embedding(word_index)
    docu_vocab = vectorizer.word_counts
    embedding_vocab = embed.get_embedding_vocab()
    oov_words = embed.check_coverage(docu_vocab, embedding_vocab)
    print('Collected oov words.')
    return oov_words
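# Usage sketch tying get_embedding_matrix_and_vectorizer and
# check_embedding_quality together; `conversations` is a hypothetical corpus,
# and MAX_VOCAB_SIZE / TOKENIZER_PATH are assumed module-level constants.
conversations = ["how do i reset my password", "try the account settings page"]
embedding_matrix, vectorizer = get_embedding_matrix_and_vectorizer(conversations)
oov_words = check_embedding_quality(conversations)  # words the embeddings miss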
def Learn():
    logreg = SGDClassifier(loss='log', n_jobs=-1)
    data = read_csv('myds.csv', delimiter=';')
    print("Dataset loaded into memory\nTransforming the loaded data into the required form...")
    X, vectorw, vectorc = Vectorize(data.Sentence)
    le = LabelEncoder()
    y = le.fit_transform(data.Category)
    print("Starting the training process...")
    X_train, X_test, y_train, y_test = train(X, y)
    logreg.fit(X_train, y_train)
    print("Training finished")
    dump(logreg, 'model.joblib')
    dump(vectorw, 'vectorw.joblib')
    dump(vectorc, 'vectorc.joblib')
    print("Model dump saved to 'model.joblib'")
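# Usage sketch (hypothetical): reload the artifacts written by Learn(). How a
# new sentence gets vectorized for prediction depends on this project's
# Vectorize(), so only the loading step is shown.
from joblib import load

model = load('model.joblib')
vectorw = load('vectorw.joblib')
vectorc = load('vectorc.joblib')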
class GUI:
    def __init__(self):
        self.root = Tk()
        self.home()
        self.root.geometry("500x200")

    def home(self):
        # self.root.eval('tk::PlaceWindow %s center' % self.root.winfo_pathname(self.root.winfo_id()))
        self.root.title("Click UPLOAD to begin")
        self.root.config(bg="white")
        frame = Frame(self.root)
        Button(frame, text="UPLOAD", width=8,
               command=self.get_filename).grid(column=1, row=3, sticky=W)
        Button(frame, text="QUIT", width=8,
               command=self.root.destroy).grid(column=2, row=3, sticky=W)
        frame.grid(row=3, column=2)
        self.root.mainloop()

    def get_filename(self):
        try:
            self.root.withdraw()
            self.loading.withdraw()
        except (AttributeError, TclError):
            # the loading window may not exist yet
            pass
        from tkinter.filedialog import askopenfilename
        self.fname = askopenfilename()
        if not self.fname:  # dialog canceled: askopenfilename returns '' or ()
            self.home()
        else:
            self.loading1()

    def loading1(self):
        loading = Toplevel()
        self.loading = loading
        # self.root.eval('tk::PlaceWindow %s center' % self.loading.winfo_pathname(self.loading.winfo_id()))
        loading.title("Loading...")
        loading.config(bg="white")
        frame = Frame(loading)
        Label(frame, text='Loading...').grid(row=2, column=2, pady=100, padx=100)
        frame.grid(row=4, column=4)
        self.loading.after(200, self.check_read)
        loading.mainloop()

    def check_read(self):
        failed_read = self.read_file()
        if failed_read:
            self.get_filename()
        else:
            failed_set_var = self.set_vars()
            if failed_set_var:
                self.get_filename()
            else:
                self.input_page()

    def read_file(self):
        self.ext = self.fname.split('.')[-1]
        if self.ext == 'xlsx':
            try:
                self.df = pd.read_excel(self.fname, sheet_name='Sheet1')
            except Exception:
                self.df = pd.read_excel(self.fname)
        elif self.ext == 'csv':
            self.df = pd.read_csv(self.fname)
        else:
            return 'File extension not valid. Select another file.'
        self.df.columns = [c.replace(' ', '_').lower() for c in self.df.columns]
        try:
            self.df = self.df[self.df['sales_channel'] == 'Warehouse']
        except KeyError:
            pass
        keep = [
            'legacy_division_cd', 'legacy_system_cd', 'segment',
            'legacy_product_cd', 'sales_channel', 'sales_6_mos', 'cogs_6mos',
            'qty_6mos', 'picks_6mos', 'net_oh', 'legacy_customer_cd',
            'core_item_flag', 'margin_%', 'net_oh_$', 'dioh',
            'national_acct_flag', 'item_poi_days',
        ]
        self.df = self.df[keep].fillna(0).replace('-', 0).replace('No Venloc', 0)

    def set_vars(self):
        options = [
            'sales_6_mos', 'cogs_6mos', 'qty_6mos', 'picks_6mos',
            'net_oh_$_6mos',
            # 'pallet_quantity',
            'margin_%'
        ]
        try:
            self.field_options = ['turn_6mos', 'profit_6mos', 'customers_per_product'] + \
                [x for x in self.df.columns if x in options]
        except AttributeError:
            self.loading.withdraw()
            self.get_filename()
        wh_options = self.df.legacy_division_cd.unique().astype(str)
        if wh_options is None:
            return 'no warehouses'
        self.segment_options = self.df.segment.unique()
        if self.segment_options is None:
            return 'no segments'
        self.region_options = self.df['legacy_system_cd'].unique()
        if self.region_options is None:
            return 'no regions'
        self.level_options = ['warehouse', 'region', 'enterprise']
        self.region_options = np.append(['All'], self.region_options)
        self.wh_options = np.append(['All'], wh_options)
        self.wh_var = StringVar(self.root, value='All')
        self.segment_var = StringVar(self.root, value=self.segment_options[0])
        self.cutoff_var = StringVar(self.root, value='20')
        self.field_var = []
        self.weight_var = [
            StringVar(self.root, value='33.3'),
            StringVar(self.root, value='33.3'),
            StringVar(self.root, value='33.3')
        ]
        self.region_var = StringVar(self.root, value='All')
        self.level_var = StringVar(self.root, value='warehouse')
        self.objective = StringVar(self.root, value='Identify core products')
        self.natl_acct = IntVar(self.root, value=1)

    def input_page(self):
        self.loading.withdraw()
        try:
            # if the tool has already done one analysis and needs to do another
            self.outputs.withdraw()
        except AttributeError:
            pass
        define = Toplevel()
        self.define = define
        # self.root.eval('tk::PlaceWindow %s center' % self.define.winfo_pathname(self.define.winfo_id()))
        define.title("Define inputs")
        define.config(bg="white")
        frame = Frame(define)
        self.input_frame = frame
        Label(frame, text="Modify model inputs below and click RUN. ").grid(row=0, column=1, pady=10)
        Button(frame, text="RUN", width=8, command=self.check_inputs).grid(row=0, column=2, pady=10)
        Label(frame, text="Select model goal: ").grid(row=1, column=0, pady=10)
        OptionMenu(frame, self.objective, 'Identify core products',
                   'Identify products to remove').grid(row=1, column=1, pady=10)
        Label(frame, text="Select segment: ").grid(row=2, column=0, pady=10)
        OptionMenu(frame, self.segment_var, *self.segment_options).grid(row=2, column=1, pady=10)
        Label(frame, text="Exclude products ordered by national account(s)? ").grid(row=3, column=0, pady=10)
        Checkbutton(frame, text='', variable=self.natl_acct).grid(row=3, column=1)
        Label(frame, text="Select scope level and press REFRESH: ").grid(row=4, column=0, pady=10)
        OptionMenu(frame, self.level_var, *self.level_options).grid(row=4, column=1, pady=10)
        Button(frame, text='REFRESH', command=self.popup_level_options).grid(row=4, column=2, pady=10)
        Label(frame, text="Set % to identify: ").grid(row=5, column=0, pady=10)
        Entry(frame, textvariable=self.cutoff_var).grid(row=5, column=1, pady=10)
        Label(frame, text="Select field(s) to consider and enter weights: ").grid(row=6, column=0, pady=10)
        self.btns = []
        self.entries = []
        self.rows = []
        preselected = list(
            compress(list(range(len(self.field_options))),
                     [float(x.get()) for x in self.weight_var]))
        for idx in range(len(self.field_options)):
            if idx in preselected:
                var = IntVar(self.root, value=1)
                weightvar = StringVar(self.root, value=33.33)
            else:
                var = IntVar(self.root)
                weightvar = IntVar(self.root, value=0)
            self.weight_var.append(weightvar)
            self.field_var.append(var)
            txt = self.field_options[idx]
            btn = Checkbutton(frame, text=txt, variable=self.field_var[idx])
            btn.grid(row=6 + idx, column=1, pady=10)
            entry = Entry(frame, text=self.weight_var[idx].get(),
                          textvariable=self.weight_var[idx])
            entry.grid(row=6 + idx, column=2, pady=10, padx=10)
            self.btns += [btn]
            self.entries += [entry]
            self.rows += [6 + idx]
        self.last_row = 6 + len(self.field_options)
        Button(frame, text="Set equal weights among checked fields",
               command=self.reset_weights).grid(row=6, column=3, pady=10)
        frame.grid(row=4, column=len(self.field_options))
        define.mainloop()

    def popup_level_options(self):
        try:
            self.wh_label.destroy()
            self.wh_optionmenu.destroy()
        except AttributeError:
            pass
        try:
            self.region_label.destroy()
            self.region_optionmenu.destroy()
        except AttributeError:
            pass
        if self.level_var.get() == 'warehouse':
            self.wh_label = Label(self.input_frame, text="Select warehouse(s): ")
            self.wh_label.grid(row=4, column=3, pady=10, padx=10)
            self.wh_optionmenu = OptionMenu(self.input_frame, self.wh_var, *self.wh_options)
            self.wh_optionmenu.grid(row=4, column=4, pady=10)
        elif self.level_var.get() == 'region':
            self.region_label = Label(self.input_frame, text="Select region(s): ")
            self.region_label.grid(row=4, column=3, pady=10)
            self.region_optionmenu = OptionMenu(self.input_frame, self.region_var, *self.region_options)
            self.region_optionmenu.grid(row=4, column=4, pady=10)

    def reset_weights(self):
        total = sum([x.get() for x in self.field_var])
        for x in range(len(self.field_var)):
            if self.field_var[x].get() == 1:
                self.weight_var[x] = StringVar(self.root, value=round(100 / total, 2))
            else:
                self.weight_var[x] = StringVar(self.root, value=0)
            # rebuild the entry so it tracks the new variable
            self.entries[x].destroy()
            self.entries[x] = Entry(self.input_frame,
                                    text=self.weight_var[x].get(),
                                    textvariable=self.weight_var[x])
            self.entries[x].grid(row=self.rows[x], column=2, pady=10, padx=10)

    def check_inputs(self):
        try:
            self.ErrorLabel.destroy()
        except AttributeError:
            pass
        err = ''
        try:
            self.cutoff = float(self.cutoff_var.get())
        except ValueError:
            err = 'ERROR. Enter a numeric value for % core products.'
        try:
            self.weights = [float(w.get()) for w in self.weight_var]
            total = sum(self.weights)
            assert 99 <= total <= 101
        except TclError:
            err = 'ERROR. Enter a numeric value for each of the field weights.'
        except AssertionError:
            err = 'ERROR. Sum of field weights must equal 100.'
        if err:
            # keep a reference to the Label itself so it can be destroyed later
            self.ErrorLabel = Label(self.input_frame, text=err)
            self.ErrorLabel.grid(row=self.last_row + 1, column=0)
            return
        else:
            self.format_vars()
            self.loading_page2()

    def format_vars(self):
        field_var = [x.get() for x in self.field_var]
        segment_var = self.segment_var.get()
        wh_var = self.wh_var.get()
        region_var = self.region_var.get()
        level_var = self.level_var.get()
        obj_var = self.objective.get()
        natl_acct_var = self.natl_acct.get()
        try:
            wh_var = [int(wh_var)]
        except ValueError:
            assert wh_var == 'All'
            wh_var = self.df['legacy_division_cd'].unique()
        try:
            region_var = [int(region_var)]
        except ValueError:
            assert region_var == 'All'
            region_var = self.df['legacy_system_cd'].unique()
        params = [
            obj_var, segment_var, field_var, natl_acct_var,
            self.field_options, self.cutoff, self.weights, self.df, self.fname
        ]
        if level_var == 'warehouse':
            self.model = Vectorize(level_var, wh_var, *params)
        elif level_var == 'region':
            self.model = Vectorize(level_var, region_var, *params)
        else:
            self.model = Vectorize('region', 'All', *params)

    def loading_page2(self):
        self.define.withdraw()
        loading2 = Toplevel()
        self.loading2 = loading2
        # self.root.eval('tk::PlaceWindow %s center' % self.loading2.winfo_pathname(self.loading2.winfo_id()))
        loading2.title("Loading...")
        loading2.config(bg="white")
        frame = Frame(loading2)
        Label(frame, text='Loading...').grid(row=2, column=2, pady=100, padx=100)
        frame.grid(row=4, column=4)
        self.loading2.after(200, self.move_on)
        loading2.mainloop()

    def move_on(self):
        self.model.run()
        self.output_page()

    def output_page(self):
        self.redo = True
        self.loading2.withdraw()
        outputs = Toplevel()
        self.outputs = outputs
        # self.root.eval('tk::PlaceWindow %s center' % self.outputs.winfo_pathname(self.outputs.winfo_id()))
        outputs.title("Outputs")
        outputs.config(bg="white")
        self.output_frame = Frame(outputs)
        # Label(self.output_frame, text="Success!").grid(row=0, column=0, pady=10)
        Button(self.output_frame, text="Export to Excel",
               command=self.export).grid(row=1, column=0, pady=10)
        Button(self.output_frame, text="Rerun with new parameters",
               command=self.input_page).grid(row=2, column=0, pady=10)
        text = Text(self.output_frame)
        text.grid(row=3, column=0, pady=0)
        text.insert(INSERT, self.model.string_output())
        self.output_frame.grid(row=3, column=1)
        # self.output_frame.grid_columnconfigure(0, weight=1)
        # self.output_frame.grid_columnconfigure(1, weight=1)
        # self.output_frame.grid_rowconfigure(0, weight=1)
        # self.output_frame.grid_rowconfigure(1, weight=1)
        # self.output_frame.grid_rowconfigure(2, weight=1)
        # self.output_frame.grid_rowconfigure(3, weight=1)
        outputs.mainloop()

    def export(self):
        from tkinter.filedialog import asksaveasfilename
        addon = '_' + self.level_var.get()
        addon += '_new_core' if self.objective.get() == 'Identify core products' else '_rationalized'
        newfname = self.fname.split('/')[-1][:-len(self.ext) - 1] + addon
        fout = asksaveasfilename(initialdir='/'.join(self.fname.split('/')[:-1]),
                                 initialfile=newfname,
                                 filetypes=[('Excel spreadsheet', '.xlsx')])
        if not fout:  # dialog canceled
            return
        else:
            self.model.df.to_excel(fout)
            Label(self.output_frame, text="Exported successfully!").grid(row=0, column=0, pady=10)
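# Minimal launcher sketch: constructing GUI() starts the Tk main loop itself,
# since __init__ calls home().
if __name__ == '__main__':
    GUI()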
class EncoderBase:
    def __init__(self, data_infile, fasttext_model_path):
        self.loss_weights = {'siamese': 1, 'grounding': 1}
        with open(data_infile, 'r') as f:
            self.clusters = json.load(f)
        self.lookup = {}
        for label, strings in self.clusters.items():
            for string in strings:
                self.lookup[string] = label
        # embed all names
        all_names = set(self.lookup.keys())
        self.vectorize = Vectorize(fasttext_model_path)
        self.pretrained_name_embeddings = self.vectorize.create_reach_object(all_names)
        self.amount_negative_names = 1
        self.triplet_margin = 0.1
        self.anchor_margin = 0
        torch.autograd.set_detect_anomaly(True)

    def preprocess(self, name):
        return ' '.join(tokenize(name)).lower()

    def triplet_loss(self, positive_distance, negative_similarity,
                     override_margin=False, new_margin=0):
        if override_margin:
            triplet_margin = new_margin
        else:
            triplet_margin = self.triplet_margin
        triplet_loss = positive_distance - negative_similarity + triplet_margin
        triplet_loss = F.relu(triplet_loss)
        return triplet_loss

    def positive_distance(self, anchor_batch, positive_batch):
        # take the cosine similarity of the outputted reference and synonym embeddings
        ref = anchor_batch / anchor_batch.norm(dim=1).reshape(-1, 1)
        syn = positive_batch / positive_batch.norm(dim=1).reshape(-1, 1)
        dot_products = torch.stack([
            torch.mm(x.reshape(1, -1), y.reshape(1, -1).t())
            for x, y in zip(ref, syn)
        ], dim=0)
        dot_product = torch.mean(dot_products)
        positive_distance = 1 - dot_product
        return positive_distance

    def negative_distance(self, anchor_batch, negatives_batch,
                          override_amount_negative=0):
        if override_amount_negative:
            amount_negative = override_amount_negative
        else:
            amount_negative = self.amount_negative_names
        # take the negative dot product of the outputted reference and negatives embeddings
        reference_batch = anchor_batch.reshape(-1, 1, negatives_batch.shape[-1])
        ref = reference_batch / reference_batch.norm(dim=2).reshape(-1, 1, 1)
        neg = negatives_batch / negatives_batch.norm(dim=2).reshape(-1, amount_negative, 1)
        dot_products = []
        for x, y in zip(ref, neg):
            dot_product = torch.mm(x, y.t())
            # apply accumulation strategy for a single instance
            accumulated_dot_product = dot_product.mean()
            dot_products.append(accumulated_dot_product)
        dot_products = torch.stack(dot_products, dim=0)
        # extract a single loss value for the entire batch
        dot_product = torch.mean(dot_products)
        negative_distance = 1 - dot_product
        return negative_distance

    def batch_cosines(self, anchor_batch, distance_batch):
        ref = anchor_batch / anchor_batch.norm(dim=1).reshape(-1, 1)
        dist = distance_batch / distance_batch.norm(dim=1).reshape(-1, 1)
        dot_products = []
        for x, y in zip(ref, dist):
            dot_product = torch.mm(x.reshape(1, -1), y.reshape(1, -1).t())
            dot_products.append(dot_product)
        dot_products = torch.stack(dot_products, dim=0)
        return dot_products

    def pretrained_loss(self, online_batch, pretrained_batch):
        # take the dot product of the outputted reference and original embeddings
        online = online_batch / online_batch.norm(dim=1).reshape(-1, 1)
        pretrained = pretrained_batch / pretrained_batch.norm(dim=1).reshape(-1, 1)
        dot_products = torch.stack([
            torch.mm(x.reshape(1, -1), y.reshape(1, -1).t())
            for x, y in zip(online, pretrained)
        ], dim=0)
        dot_product = torch.mean(dot_products)
        pretrained_loss = 1 - dot_product + self.anchor_margin
        pretrained_loss = F.relu(pretrained_loss)
        return pretrained_loss
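# A minimal sketch of how the loss pieces above combine into a training
# objective. The batches are random stand-ins for encoder outputs, and the
# instance is built with __new__ to skip __init__ (which needs a real cluster
# file and fastText model); the field values mirror the defaults set above.
import torch
import torch.nn.functional as F  # used inside EncoderBase

enc = EncoderBase.__new__(EncoderBase)
enc.loss_weights = {'siamese': 1, 'grounding': 1}
enc.amount_negative_names = 1
enc.triplet_margin = 0.1
enc.anchor_margin = 0

anchor = torch.randn(4, 300)        # hypothetical batch of 4 name embeddings
positive = torch.randn(4, 300)      # synonym embeddings
negatives = torch.randn(4, 1, 300)  # one negative per anchor
pretrained = torch.randn(4, 300)    # original fastText embeddings

pos_d = enc.positive_distance(anchor, positive)
neg_d = enc.negative_distance(anchor, negatives)
siamese_loss = enc.triplet_loss(pos_d, neg_d)
grounding_loss = enc.pretrained_loss(anchor, pretrained)
total_loss = (enc.loss_weights['siamese'] * siamese_loss
              + enc.loss_weights['grounding'] * grounding_loss)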