def __init__(self, vocab_size: int, head_num: int = 8, hidden_dim: int = 512,
             dropout_rate: float = 0.1, max_len: int = 50, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.vocab_size = vocab_size
    self.head_num = head_num
    self.hidden_dim = hidden_dim
    self.dropout_rate = dropout_rate
    self.max_len = max_len
    # Encoder-side embedding layer
    self.enc_embedding = WordEmbedding(vocab_size=vocab_size, embedding_dim=hidden_dim)
    # Encoder
    self.encoder = Encoder(vocab_size=vocab_size, hidden_dim=hidden_dim,
                           dropout_rate=dropout_rate)
    # Decoder-side embedding layer
    self.dec_embedding = WordEmbedding(vocab_size=vocab_size, embedding_dim=hidden_dim)
    # Decoder
    self.decoder = Decoder(vocab_size=vocab_size, hidden_dim=hidden_dim,
                           dropout_rate=dropout_rate)
def create_fasttext_model(corpus_file, method='cbow', out_file=None, **kwargs):
    # type: (Path, str, Path, **Any) -> WordEmbedding
    """Load or create a FastText word embedding.

    Parameters:
        corpus_file (Path): The path of the corpus file.
        method (str): The model type. Must be either 'cbow' or 'skipgram'.
        out_file (Path): The output path of the model. Optional.
        **kwargs: Other keyword arguments.

    Returns:
        WordEmbedding: The trained FastText model.

    Raises:
        ValueError: If method is not 'cbow' or 'skipgram'.
    """
    if method not in {'cbow', 'skipgram'}:
        raise ValueError(f'method must be "cbow" or "skipgram" but got "{method}"')
    if out_file is None:
        out_file = MODELS_PATH.joinpath(corpus_file.name + f'.fasttext.{method}')
    if not out_file.exists():
        binary_file = out_file.parent.joinpath(out_file.name + '.bin')
        if not binary_file.exists():
            subprocess.run(
                ['fasttext', method, '-input', str(corpus_file), '-output', str(out_file)],
                check=True,
            )
        embedding = WordEmbedding.load_fasttext_file(binary_file)
        embedding.save(out_file)
    return WordEmbedding.load_word2vec_file(out_file)
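# Hedged usage sketch for create_fasttext_model above: 'corpus.txt' is a
# hypothetical corpus file, and MODELS_PATH supplies the default output
# location; the function trains once and then reloads the cached model.
from pathlib import Path

embedding = create_fasttext_model(Path('corpus.txt'), method='skipgram')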
def main(_):
    # init
    we = WordEmbedding()
    dc = Document()
    cf = Classifier()

    # load data
    docs = dc.getDocs(labeled_only=True)

    # load word embedding model
    if FLAGS.we_model == 'devblog':
        we_model = we.loadDevblogModel(embedding_dim=FLAGS.we_dim,
                                       epochs=FLAGS.we_epoch,
                                       window=FLAGS.we_window,
                                       min_count=FLAGS.we_min_count)
        # han2jamo
        docs.text = docs.text.apply(han2Jamo)
    elif FLAGS.we_model == 'wiki':
        we_model = we.loadWikiModel()

    # word embedding
    docs.vector = docs.text.apply(lambda x: we.embedding(we_model, x))

    # training
    cf_model = cf.train(docs, './checkpoint')
    cf.saveModel(cf_model, FLAGS.cf_model)
def __init__(self, word_emb, N_word, N_h=300, N_depth=2, gpu=True,
             trainable_emb=False, table_type="std", use_hs=True):
    super(SuperModel, self).__init__()
    self.gpu = gpu
    self.N_h = N_h
    self.N_depth = N_depth
    self.trainable_emb = trainable_emb
    self.table_type = table_type
    self.use_hs = use_hs
    self.SQL_TOK = ['<UNK>', '<END>', 'WHERE', 'AND', 'EQL', 'GT', 'LT', '<BEG>']

    # word embedding layer
    self.embed_layer = WordEmbedding(word_emb, N_word, gpu, self.SQL_TOK,
                                     trainable=trainable_emb)

    # initialize all modules
    self.multi_sql = MultiSqlPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                       gpu=gpu, use_hs=use_hs)
    self.multi_sql.eval()
    self.key_word = KeyWordPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                     gpu=gpu, use_hs=use_hs)
    self.key_word.eval()
    self.col = ColPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                            gpu=gpu, use_hs=use_hs)
    self.col.eval()
    self.op = OpPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                          gpu=gpu, use_hs=use_hs)
    self.op.eval()
    self.agg = AggPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                            gpu=gpu, use_hs=use_hs)
    self.agg.eval()
    self.root_teminal = RootTeminalPredictor(N_word=N_word, N_h=N_h,
                                             N_depth=N_depth, gpu=gpu,
                                             use_hs=use_hs)
    self.root_teminal.eval()
    self.des_asc = DesAscLimitPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                        gpu=gpu, use_hs=use_hs)
    self.des_asc.eval()
    self.having = HavingPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                  gpu=gpu, use_hs=use_hs)
    self.having.eval()
    self.andor = AndOrPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                gpu=gpu, use_hs=use_hs)
    self.andor.eval()

    self.softmax = nn.Softmax(dim=1)
    self.CE = nn.CrossEntropyLoss()
    self.log_softmax = nn.LogSoftmax()
    self.mlsml = nn.MultiLabelSoftMarginLoss()
    self.bce_logit = nn.BCEWithLogitsLoss()
    self.sigm = nn.Sigmoid()
    if gpu:
        self.cuda()
    self.path_not_found = 0
def main(url="None"): # We create an instance of the word embedding wemb = WordEmbedding() # We define a data dataset = None # We will open the file(s) with open(url) as json_file: dataset = json.load(json_file) for data in dataset: d = data.get("text", "") print(d) # We get the words in the sentences words = d.split() # We get a dictionary to relate the words to their most similars embedded_text = "" # We iterate for each word and we get the word embedding for w in words: # We check if the word embedding produces results. # similars_list = wemb.get_most_similars(w) # print(similars_list) try: similars_list = wemb.get_most_similars(w) # We sort the list sorted_list = Sort(similars_list) np_array = np.array(sorted_list) embedded_text += np_array[0, 0] + " " except: # We concatenate the original word in case we coudln't find it in the word embedding embedded_text += w + " " data['embedded_text'] = embedded_text print("We have ended searching the words") # new URL new_url = new_url = url.split("/")[1].split(".")[0] # We create a new file for each hash tag file that we consulted. with open('Embedded_Results/' + new_url + "_embedded" + '.json', 'w') as outfile: # We finally dump the tweets + the overall_score in a json file. json.dump(dataset, outfile) # w_embedding.run() print("Exiting main")
def __init__(self, num_docs, vocab_size, num_topics, embedding_size, freqs,
             batch_size, save_graph, num_sampled=40):
    self.num_docs = num_docs
    self.vocab_size = vocab_size
    self.num_topics = num_topics
    self.embedding_size = embedding_size
    self.freqs = freqs
    self.batch_size = batch_size
    self.save_graph = save_graph
    self.num_sampled = num_sampled
    self.lmbda = 200.0
    self.learning_rate = 0.001
    self.moving_avgs = tf.train.ExponentialMovingAverage(0.9)
    self.config = tf.ConfigProto()
    self.config.gpu_options.allow_growth = True
    self.sesh = tf.Session(config=self.config)
    self.computed_norm = False
    self.logdir = "_".join(("lda2vec", datetime.now().strftime('%y%m%d_%H%M')))
    self.w_embed = WordEmbedding(self.embedding_size, self.vocab_size,
                                 self.num_sampled, freqs=self.freqs)
    self.mixture = EmbeddingMixture(self.num_docs, self.num_topics,
                                    self.embedding_size)
    handles = self.retrieve_variables()
    (self.x, self.y, self.docs, self.step, self.switch_loss,
     self.word_context, self.doc_context, self.loss_word2vec, self.fraction,
     self.loss_lda, self.loss, self.loss_avgs_op, self.optimizer,
     self.merged) = handles
def run(dim, epochs):
    cooc = _load_cooc_matrix()
    vocab = _load_vocab()
    print(vocab)
    word_vectors, _ = _train_embeddings(cooc, dim=dim, epochs=epochs)
    word_emb = WordEmbedding(word_vectors, vocab)

    # Save results to file
    path_word_emb = constants.GLOVE_EMBEDDING_CIL_PATH
    pickle.dump(word_emb, open(path_word_emb, "wb"))
    print("Finished saving embeddings at %s" % path_word_emb)
def _load_word2vec_embedding():
    '''
    Loads the word2vec embedding using a binary file. The word2vec embeddings
    are very large and cannot be pickled as a WordEmbedding object.
    '''
    print("Loading word2vec embeddings, this may take a while...")
    w2v_model = gensim \
        .models.KeyedVectors \
        .load_word2vec_format(constants.WORD2VEC_EMBEDDING_PATH, binary=True)
    vocab = {k: i for i, k in enumerate(w2v_model.vocab)}
    return WordEmbedding(w2v_model.vectors, vocab)
def bolukbasi_debias_generalized(embedding, words, out_file, excludes=None, **kwargs):
    # type: (WordEmbedding, Iterable[str], Path, Iterable[str], **Any) -> WordEmbedding
    """Debias a word embedding using a generalized version of Bolukbasi's algorithm.

    Parameters:
        embedding (WordEmbedding): The word embedding to debias.
        words (Iterable[str]): A list of words that define the bias subspace.
        out_file (Path): The path to save the new embedding to.
        excludes (Iterable[str]): A collection of words to be excluded from the debiasing.
        **kwargs: Other keyword arguments.

    Returns:
        WordEmbedding: The debiased word embedding.
    """
    if out_file.exists():
        return WordEmbedding.load_word2vec_file(out_file)
    matrix = recenter(
        np.array([embedding[word] for word in words if word in embedding]))
    bias_subspace = _define_pca_bias_subspace(matrix, **kwargs)
    bias_subspace = bias_subspace[np.newaxis, :]
    # debias by rejecting the subspace and reverting the excluded words
    if excludes is None:
        excludes = set()
    new_vectors = reject(embedding.vectors, bias_subspace)
    for word in excludes:
        if word in embedding:
            new_vectors[embedding.index(word)] = embedding[word]
    new_vectors = normalize(new_vectors)
    # create a word embedding from the new vectors
    new_embedding = WordEmbedding.from_vectors(embedding.words, new_vectors)
    new_embedding.source = out_file
    new_embedding.save()
    return new_embedding
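# Hedged usage sketch for bolukbasi_debias_generalized: the word list and
# output path are illustrative only; the function caches its result at
# out_file, so repeated calls simply reload the saved embedding.
from pathlib import Path

debiased = bolukbasi_debias_generalized(
    embedding,                           # a WordEmbedding loaded elsewhere
    words=['he', 'she', 'him', 'her'],   # words that span the bias subspace
    out_file=Path('debiased.w2v'),
    excludes={'king', 'queen'},          # words to revert after the projection
)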
def main(_):
    # init
    we = WordEmbedding()
    dc = Document()
    cf = Classifier()

    # load word embedding model
    if FLAGS.we_model == 'devblog':
        we_model = we.loadDevblogModel(embedding_dim=FLAGS.we_dim,
                                       epochs=FLAGS.we_epoch,
                                       window=FLAGS.we_window,
                                       min_count=FLAGS.we_min_count)
    elif FLAGS.we_model == 'wiki':
        we_model = we.loadWikiModel()

    # load classifier model
    cf_model = cf.loadModel(FLAGS.cf_model)

    results = [{'text': r} for r in FLAGS.predict]
    is_devblog = FLAGS.we_model == 'devblog'
    for i, r in enumerate(FLAGS.predict):
        # preprocessing
        text = han2Jamo(r) if is_devblog else r
        # word embedding
        df = dc.preprocessing(text, devblog=is_devblog)
        vector = df.text.apply(
            lambda x: we.embedding(we_model, x, FLAGS.we_dim)).tolist()
        if len(vector) == 0:
            print('🐈 text is not valid :', r)
            return
        else:
            # predict
            results[i]['predict'] = cf.predict(cf_model, np.array(vector),
                                               FLAGS.criterion)
    return results
def adapt_embed(path, bin_path, embed_path, strategy, source_lang, target_lang):
    # process encoder [MA]
    embed_encoder_path = os.path.join(path, 'tmp', 'generated_embeds_encoder.txt')
    word_embedding = WordEmbedding(
        os.path.join(os.path.abspath(embed_path), 'best_embeds_encoder.txt'),
        os.path.join(os.path.abspath(bin_path), f"dict.{source_lang}.txt"))
    word_embedding.process_embed(strategy, embed_encoder_path)

    # process decoder [MA]
    embed_decoder_path = os.path.join(path, 'tmp', 'generated_embeds_decoder.txt')
    word_embedding = WordEmbedding(
        os.path.join(os.path.abspath(embed_path), 'best_embeds_decoder.txt'),
        os.path.join(os.path.abspath(bin_path), f"dict.{target_lang}.txt"))
    word_embedding.process_embed(strategy, embed_decoder_path)
    return embed_encoder_path, embed_decoder_path
def main(self):
    '''
    This is the main function for performing the Document Clustering.
    '''
    # Create object of ConfigParse class and parse the config file
    config_obj = ConfigParse()
    print('READING CONFIG FILE')
    config_obj.config_reader()

    # Create object of WordEmbedding class
    word_embedding_obj = WordEmbedding(config_obj.input_file_path,
                                       config_obj.word2vec_model,
                                       config_obj.word_vector_dim)
    print('CONVERTING INPUT SENTENCES TO VECTORS')
    embedding_file = word_embedding_obj.sentence_to_vector()

    # Create object of Clustering class
    clustering_obj = Clustering(embedding_file, config_obj.output_dir_path,
                                config_obj.threshold,
                                config_obj.representative_word_vector,
                                config_obj.cluster_overlap,
                                config_obj.word_vector_dim)
    print('CLUSTERING SENTENCES')
    num_of_clusters = clustering_obj.cluster_sentences()
    print(str(num_of_clusters) + ' CLUSTERS WERE GENERATED.')

    # Remove temporary files
    os.remove(embedding_file)
    for subdir, dirs, cluster_files in os.walk(config_obj.output_dir_path):
        for cluster_file in cluster_files:
            if 'rep_' in cluster_file:
                os.remove(config_obj.output_dir_path + '/' + cluster_file)
    use_hs=use_hs)
model.load_state_dict(
    torch.load("{}/andor_models.dump".format(SAVED_MODELS_FOLDER),
               map_location=map_to))
# model = SQLNet(word_emb, N_word=N_word, gpu=GPU, trainable_emb=args.train_emb)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0)
print("finished building model")
print_flag = False
embed_layer = WordEmbedding(word_emb, N_word, gpu=GPU, SQL_TOK=SQL_TOK,
                            trainable=args.train_emb)
print("Dev Accuracy")
# best_acc = 0.0
# for i in range(args.epoch):
#     print('Epoch %d @ %s' % (i + 1, datetime.datetime.now()))
#     # arguments of epoch_train:
#     # model, optimizer, batch_size, component, embed_layer, data, table_type
#     print('    Loss = %s' % epoch_train(
#         model, optimizer, BATCH_SIZE,
#         args.train_component,
#         embed_layer,
#         train_data,
#         table_type=args.table_type))
class SuperModel(nn.Module):
    def __init__(self, word_emb, N_word, N_h=300, N_depth=2, gpu=True,
                 trainable_emb=False, table_type="std", use_hs=True):
        super(SuperModel, self).__init__()
        self.gpu = gpu
        self.N_h = N_h
        self.N_depth = N_depth
        self.trainable_emb = trainable_emb
        self.table_type = table_type
        self.use_hs = use_hs
        self.SQL_TOK = ['<UNK>', '<END>', 'WHERE', 'AND', 'EQL', 'GT', 'LT', '<BEG>']

        # word embedding layer
        self.embed_layer = WordEmbedding(word_emb, N_word, gpu, self.SQL_TOK,
                                         trainable=trainable_emb)

        # initialize all modules
        self.multi_sql = MultiSqlPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                           gpu=gpu, use_hs=use_hs)
        self.multi_sql.eval()
        self.key_word = KeyWordPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                         gpu=gpu, use_hs=use_hs)
        self.key_word.eval()
        self.col = ColPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                gpu=gpu, use_hs=use_hs)
        self.col.eval()
        self.op = OpPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                              gpu=gpu, use_hs=use_hs)
        self.op.eval()
        self.agg = AggPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                gpu=gpu, use_hs=use_hs)
        self.agg.eval()
        self.root_teminal = RootTeminalPredictor(N_word=N_word, N_h=N_h,
                                                 N_depth=N_depth, gpu=gpu,
                                                 use_hs=use_hs)
        self.root_teminal.eval()
        self.des_asc = DesAscLimitPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                            gpu=gpu, use_hs=use_hs)
        self.des_asc.eval()
        self.having = HavingPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                      gpu=gpu, use_hs=use_hs)
        self.having.eval()
        self.andor = AndOrPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                    gpu=gpu, use_hs=use_hs)
        self.andor.eval()

        self.softmax = nn.Softmax()  # dim=1
        self.CE = nn.CrossEntropyLoss()
        self.log_softmax = nn.LogSoftmax()
        self.mlsml = nn.MultiLabelSoftMarginLoss()
        self.bce_logit = nn.BCEWithLogitsLoss()
        self.sigm = nn.Sigmoid()
        if gpu:
            self.cuda()
        self.path_not_found = 0

    def forward(self, q_seq, history, tables):
        return self.full_forward(q_seq, history, tables)

    def full_forward(self, q_seq, history, tables):
        B = len(q_seq)
        q_emb_var, q_len = self.embed_layer.gen_x_q_batch(q_seq)
        col_seq = to_batch_tables(tables, B, self.table_type)
        col_emb_var, col_name_len, col_len = self.embed_layer.gen_col_batch(col_seq)

        mkw_emb_var = self.embed_layer.gen_word_list_embedding(
            ["none", "except", "intersect", "union"], B)
        mkw_len = np.full(q_len.shape, 4, dtype=np.int64)
        kw_emb_var = self.embed_layer.gen_word_list_embedding(
            ["where", "group by", "order by"], B)
        kw_len = np.full(q_len.shape, 3, dtype=np.int64)

        stack = Stack()
        stack.push(("root", None))
        history = [["root"]] * B
        andor_cond = ""
        has_limit = False
        current_sql = {}
        sql_stack = []
        idx_stack = []
        kw_stack = []
        kw = ""
        nested_label = ""
        has_having = False

        timeout = time.time() + 2  # set timer to prevent infinite recursion in SQL generation
        failed = False
        while not stack.isEmpty():
            if time.time() > timeout:
                failed = True
                break
            vet = stack.pop()
            hs_emb_var, hs_len = self.embed_layer.gen_x_history_batch(history)
            if len(idx_stack) > 0 and stack.size() < idx_stack[-1]:
                idx_stack.pop()
                current_sql = sql_stack.pop()
                kw = kw_stack.pop()
            if isinstance(vet, tuple) and vet[0] == "root":
                if history[0][-1] != "root":
                    history[0].append("root")
                    hs_emb_var, hs_len = self.embed_layer.gen_x_history_batch(history)
                if vet[1] != "original":
                    idx_stack.append(stack.size())
                    sql_stack.append(current_sql)
                    kw_stack.append(kw)
                else:
                    idx_stack.append(stack.size())
                    sql_stack.append(sql_stack[-1])
                    kw_stack.append(kw)
                if "sql" in current_sql:
                    current_sql["nested_sql"] = {}
                    current_sql["nested_label"] = nested_label
                    current_sql = current_sql["nested_sql"]
                elif isinstance(vet[1], dict):
                    vet[1]["sql"] = {}
                    current_sql = vet[1]["sql"]
                elif vet[1] != "original":
                    current_sql["sql"] = {}
                    current_sql = current_sql["sql"]
                if vet[1] == "nested" or vet[1] == "original":
                    stack.push("none")
                    history[0].append("none")
                else:
                    score = self.multi_sql.forward(q_emb_var, q_len, hs_emb_var,
                                                   hs_len, mkw_emb_var, mkw_len)
                    label = np.argmax(score[0].data.cpu().numpy())
                    label = SQL_OPS[label]
                    history[0].append(label)
                    stack.push(label)
                    if label != "none":
                        nested_label = label
            elif vet in ('intersect', 'except', 'union'):
                stack.push(("root", "nested"))
                stack.push(("root", "original"))
            elif vet == "none":
                score = self.key_word.forward(q_emb_var, q_len, hs_emb_var,
                                              hs_len, kw_emb_var, kw_len)
                kw_num_score, kw_score = [x.data.cpu().numpy() for x in score]
                num_kw = np.argmax(kw_num_score[0])
                kw_score = list(np.argsort(-kw_score[0])[:num_kw])
                kw_score.sort(reverse=True)
                for kw in kw_score:
                    stack.push(KW_OPS[kw])
                stack.push("select")
            elif vet in ("select", "orderBy", "where", "groupBy", "having"):
                kw = vet
                current_sql[kw] = []
                history[0].append(vet)
                stack.push(("col", vet))
            elif isinstance(vet, tuple) and vet[0] == "col":
                score = self.col.forward(q_emb_var, q_len, hs_emb_var, hs_len,
                                         col_emb_var, col_len, col_name_len)
                col_num_score, col_score = [x.data.cpu().numpy() for x in score]
                col_num = np.argmax(col_num_score[0]) + 1  # double check
                cols = np.argsort(-col_score[0])[:col_num]
                for col in cols:
                    if vet[1] == "where":
                        stack.push(("op", "where", col))
                    elif vet[1] != "groupBy":
                        stack.push(("agg", vet[1], col))
                    elif vet[1] == "groupBy":
                        history[0].append(index_to_column_name(col, tables))
                        current_sql[kw].append(index_to_column_name(col, tables))
                # predict and/or when there are multiple columns in the where condition
                if col_num > 1 and vet[1] == "where":
                    score = self.andor.forward(q_emb_var, q_len, hs_emb_var, hs_len)
                    label = np.argmax(score[0].data.cpu().numpy())
                    andor_cond = COND_OPS[label]
                    current_sql[kw].append(andor_cond)
                if vet[1] == "groupBy" and col_num > 0:
                    score = self.having.forward(q_emb_var, q_len, hs_emb_var,
                                                hs_len, col_emb_var, col_len,
                                                col_name_len,
                                                np.full(B, cols[0], dtype=np.int64))
                    label = np.argmax(score[0].data.cpu().numpy())
                    if label == 1:
                        has_having = (label == 1)
                        stack.push("having")
            elif isinstance(vet, tuple) and vet[0] == "agg":
                history[0].append(index_to_column_name(vet[2], tables))
                if vet[1] not in ("having", "orderBy"):  # DEBUG-ed 20180817
                    try:
                        current_sql[kw].append(index_to_column_name(vet[2], tables))
                    except Exception:
                        traceback.print_exc()
                        print("history:{},current_sql:{} stack:{}".format(
                            history[0], current_sql, stack.items))
                        print("idx_stack:{}".format(idx_stack))
                        print("sql_stack:{}".format(sql_stack))
                        exit(1)
                hs_emb_var, hs_len = self.embed_layer.gen_x_history_batch(history)
                score = self.agg.forward(q_emb_var, q_len, hs_emb_var, hs_len,
                                         col_emb_var, col_len, col_name_len,
                                         np.full(B, vet[2], dtype=np.int64))
                agg_num_score, agg_score = [x.data.cpu().numpy() for x in score]
                agg_num = np.argmax(agg_num_score[0])  # double check
                agg_idxs = np.argsort(-agg_score[0])[:agg_num]
                if len(agg_idxs) > 0:
                    history[0].append(AGG_OPS[agg_idxs[0]])
                    if vet[1] not in ("having", "orderBy"):
                        current_sql[kw].append(AGG_OPS[agg_idxs[0]])
                    elif vet[1] == "orderBy":
                        stack.push(("des_asc", vet[2], AGG_OPS[agg_idxs[0]]))  # DEBUG-ed 20180817
                    else:
                        stack.push(("op", "having", vet[2], AGG_OPS[agg_idxs[0]]))
                for agg in agg_idxs[1:]:
                    history[0].append(index_to_column_name(vet[2], tables))
                    history[0].append(AGG_OPS[agg])
                    if vet[1] not in ("having", "orderBy"):
                        current_sql[kw].append(index_to_column_name(vet[2], tables))
                        current_sql[kw].append(AGG_OPS[agg])
                    elif vet[1] == "orderBy":
                        stack.push(("des_asc", vet[2], AGG_OPS[agg]))
                    else:
                        stack.push(("op", "having", vet[2], agg_idxs))
                if len(agg_idxs) == 0:
                    if vet[1] not in ("having", "orderBy"):
                        current_sql[kw].append("none_agg")
                    elif vet[1] == "orderBy":
                        stack.push(("des_asc", vet[2], "none_agg"))
                    else:
                        stack.push(("op", "having", vet[2], "none_agg"))
            elif isinstance(vet, tuple) and vet[0] == "op":
                if vet[1] == "where":
                    history[0].append(index_to_column_name(vet[2], tables))
                    hs_emb_var, hs_len = self.embed_layer.gen_x_history_batch(history)
                score = self.op.forward(q_emb_var, q_len, hs_emb_var, hs_len,
                                        col_emb_var, col_len, col_name_len,
                                        np.full(B, vet[2], dtype=np.int64))
                op_num_score, op_score = [x.data.cpu().numpy() for x in score]
                op_num = np.argmax(op_num_score[0]) + 1  # num_score 0 maps to 1 in truth; must have at least one op
                ops = np.argsort(-op_score[0])[:op_num]
                if op_num > 0:
                    history[0].append(NEW_WHERE_OPS[ops[0]])
                    if vet[1] == "having":
                        stack.push(("root_teminal", vet[2], vet[3], ops[0]))
                    else:
                        stack.push(("root_teminal", vet[2], ops[0]))
                for op in ops[1:]:
                    history[0].append(index_to_column_name(vet[2], tables))
                    history[0].append(NEW_WHERE_OPS[op])
                    if vet[1] == "having":
                        stack.push(("root_teminal", vet[2], vet[3], op))
                    else:
                        stack.push(("root_teminal", vet[2], op))
            elif isinstance(vet, tuple) and vet[0] == "root_teminal":
                score = self.root_teminal.forward(q_emb_var, q_len, hs_emb_var,
                                                  hs_len, col_emb_var, col_len,
                                                  col_name_len,
                                                  np.full(B, vet[1], dtype=np.int64))
                label = np.argmax(score[0].data.cpu().numpy())
                label = ROOT_TERM_OPS[label]
                if len(vet) == 4:
                    current_sql[kw].append(index_to_column_name(vet[1], tables))
                    current_sql[kw].append(vet[2])
                    current_sql[kw].append(NEW_WHERE_OPS[vet[3]])
                else:
                    try:
                        current_sql[kw].append(index_to_column_name(vet[1], tables))
                    except Exception:
                        traceback.print_exc()
                        print("history:{},current_sql:{} stack:{}".format(
                            history[0], current_sql, stack.items))
                        print("idx_stack:{}".format(idx_stack))
                        print("sql_stack:{}".format(sql_stack))
                        exit(1)
                    current_sql[kw].append(NEW_WHERE_OPS[vet[2]])
                if label == "root":
                    history[0].append("root")
                    current_sql[kw].append({})
                    stack.push(("root", current_sql[kw][-1]))
                else:
                    current_sql[kw].append("terminal")
            elif isinstance(vet, tuple) and vet[0] == "des_asc":
                current_sql[kw].append(index_to_column_name(vet[1], tables))
                current_sql[kw].append(vet[2])
                score = self.des_asc.forward(q_emb_var, q_len, hs_emb_var, hs_len,
                                             col_emb_var, col_len, col_name_len,
                                             np.full(B, vet[1], dtype=np.int64))
                label = np.argmax(score[0].data.cpu().numpy())
                dec_asc, has_limit = DEC_ASC_OPS[label]
                history[0].append(dec_asc)
                current_sql[kw].append(dec_asc)
                current_sql[kw].append(has_limit)
        if failed:
            return None
        print("history:{}".format(history[0]))
        if len(sql_stack) > 0:
            current_sql = sql_stack[0]
        return current_sql

    def gen_col(self, col, table, table_alias_dict):
        colname = table["column_names_original"][col[2]][1]
        table_idx = table["column_names_original"][col[2]][0]
        if table_idx not in table_alias_dict:
            return colname
        return "T{}.{}".format(table_alias_dict[table_idx], colname)

    def gen_group_by(self, sql, kw, table, table_alias_dict):
        ret = []
        for i in range(0, len(sql)):
            ret.append(self.gen_col(sql[i], table, table_alias_dict))
        return "{} {}".format(kw, ",".join(ret))

    def gen_select(self, sql, kw, table, table_alias_dict):
        ret = []
        for i in range(0, len(sql), 2):
            if sql[i + 1] == "none_agg" or not isinstance(sql[i + 1], basestring):  # DEBUG-ed 20180817
                ret.append(self.gen_col(sql[i], table, table_alias_dict))
            else:
                ret.append("{}({})".format(
                    sql[i + 1], self.gen_col(sql[i], table, table_alias_dict)))
        return "{} {}".format(kw, ",".join(ret))

    def gen_where(self, sql, table, table_alias_dict):
        if len(sql) == 0:
            return ""
        start_idx = 0
        andor = "and"
        if isinstance(sql[0], basestring):
            start_idx += 1
            andor = sql[0]
        ret = []
        for i in range(start_idx, len(sql), 3):
            col = self.gen_col(sql[i], table, table_alias_dict)
            op = sql[i + 1]
            val = sql[i + 2]
            if val == "terminal":
                where_item = "{} {} '{}'".format(col, op, val)
            else:
                val = self.gen_sql(val, table)
                where_item = "{} {} ({})".format(col, op, val)
            if op == "between":  # TODO temporarily fixed
                where_item += " and 'terminal'"
            ret.append(where_item)
        return "where {}".format(" {} ".format(andor).join(ret))

    def gen_orderby(self, sql, table, table_alias_dict):
        ret = []
        limit = ""
        if sql[-1] == True:
            limit = "limit 1"
        for i in range(0, len(sql), 4):
            if sql[i + 1] == "none_agg" or not isinstance(sql[i + 1], basestring):  # DEBUG-ed 20180817
                ret.append("{} {}".format(
                    self.gen_col(sql[i], table, table_alias_dict), sql[i + 2]))
            else:
                ret.append("{}({}) {}".format(
                    sql[i + 1], self.gen_col(sql[i], table, table_alias_dict),
                    sql[i + 2]))
        return "order by {} {}".format(",".join(ret), limit)

    def gen_having(self, sql, table, table_alias_dict):
        ret = []
        for i in range(0, len(sql), 4):
            if sql[i + 1] == "none_agg":
                col = self.gen_col(sql[i], table, table_alias_dict)
            else:
                col = "{}({})".format(
                    sql[i + 1], self.gen_col(sql[i], table, table_alias_dict))
            op = sql[i + 2]
            val = sql[i + 3]
            if val == "terminal":
                ret.append("{} {} '{}'".format(col, op, val))
            else:
                val = self.gen_sql(val, table)
                ret.append("{} {} ({})".format(col, op, val))
        return "having {}".format(",".join(ret))

    def find_shortest_path(self, start, end, graph):
        stack = [[start, []]]
        visited = set()
        while len(stack) > 0:
            ele, history = stack.pop()
            if ele == end:
                return history
            for node in graph[ele]:
                if node[0] not in visited:
                    stack.append((node[0], history + [(node[0], node[1])]))
                    visited.add(node[0])
        print("table {} table {}".format(start, end))
        self.path_not_found += 1

    def gen_from(self, candidate_tables, table):
        def find(d, col):
            if d[col] == -1:
                return col
            return find(d, d[col])

        def union(d, c1, c2):
            r1 = find(d, c1)
            r2 = find(d, c2)
            if r1 == r2:
                return
            d[r1] = r2

        ret = ""
        if len(candidate_tables) <= 1:
            if len(candidate_tables) == 1:
                ret = "from {}".format(
                    table["table_names_original"][list(candidate_tables)[0]])
            else:
                ret = "from {}".format(table["table_names_original"][0])
            # TODO: temporary settings
            return {}, ret
        table_alias_dict = {}
        uf_dict = {}
        for t in candidate_tables:
            uf_dict[t] = -1
        idx = 1
        graph = defaultdict(list)
        for acol, bcol in table["foreign_keys"]:
            t1 = table["column_names"][acol][0]
            t2 = table["column_names"][bcol][0]
            graph[t1].append((t2, (acol, bcol)))
            graph[t2].append((t1, (bcol, acol)))
        candidate_tables = list(candidate_tables)
        start = candidate_tables[0]
        table_alias_dict[start] = idx
        idx += 1
        ret = "from {} as T1".format(table["table_names_original"][start])
        try:
            for end in candidate_tables[1:]:
                if end in table_alias_dict:
                    continue
                path = self.find_shortest_path(start, end, graph)
                prev_table = start
                if not path:
                    table_alias_dict[end] = idx
                    idx += 1
                    ret = "{} join {} as T{}".format(
                        ret, table["table_names_original"][end],
                        table_alias_dict[end])
                    continue
                for node, (acol, bcol) in path:
                    if node in table_alias_dict:
                        prev_table = node
                        continue
                    table_alias_dict[node] = idx
                    idx += 1
                    ret = "{} join {} as T{} on T{}.{} = T{}.{}".format(
                        ret, table["table_names_original"][node],
                        table_alias_dict[node],
                        table_alias_dict[prev_table],
                        table["column_names_original"][acol][1],
                        table_alias_dict[node],
                        table["column_names_original"][bcol][1])
                    prev_table = node
        except Exception:
            traceback.print_exc()
            print("db:{}".format(table["db_id"]))
            return table_alias_dict, ret
        return table_alias_dict, ret

    def gen_sql(self, sql, table):
        select_clause = ""
        from_clause = ""
        groupby_clause = ""
        orderby_clause = ""
        having_clause = ""
        where_clause = ""
        nested_clause = ""
        candidate_tables = set()
        nested_sql = {}
        nested_label = ""
        parent_sql = sql
        if "nested_label" in sql:
            nested_label = sql["nested_label"]
            nested_sql = sql["nested_sql"]
            sql = sql["sql"]
        elif "sql" in sql:
            sql = sql["sql"]
        for key in sql:
            if key not in KW_WITH_COL:
                continue
            for item in sql[key]:
                if isinstance(item, tuple) and len(item) == 3:
                    if table["column_names"][item[2]][0] != -1:
                        candidate_tables.add(table["column_names"][item[2]][0])
        table_alias_dict, from_clause = self.gen_from(candidate_tables, table)
        ret = []
        if "select" in sql:
            select_clause = self.gen_select(sql["select"], "select", table,
                                            table_alias_dict)
            if len(select_clause) > 0:
                ret.append(select_clause)
            else:
                print("select not found:{}".format(parent_sql))
        else:
            print("select not found:{}".format(parent_sql))
        if len(from_clause) > 0:
            ret.append(from_clause)
        if "where" in sql:
            where_clause = self.gen_where(sql["where"], table, table_alias_dict)
            if len(where_clause) > 0:
                ret.append(where_clause)
        if "groupBy" in sql:  # DEBUG-ed order
            groupby_clause = self.gen_group_by(sql["groupBy"], "group by",
                                               table, table_alias_dict)
            if len(groupby_clause) > 0:
                ret.append(groupby_clause)
        if "orderBy" in sql:
            orderby_clause = self.gen_orderby(sql["orderBy"], table,
                                              table_alias_dict)
            if len(orderby_clause) > 0:
                ret.append(orderby_clause)
        if "having" in sql:
            having_clause = self.gen_having(sql["having"], table,
                                            table_alias_dict)
            if len(having_clause) > 0:
                ret.append(having_clause)
        if len(nested_label) > 0:
            nested_clause = "{} {}".format(nested_label,
                                           self.gen_sql(nested_sql, table))
            if len(nested_clause) > 0:
                ret.append(nested_clause)
        return " ".join(ret)

    def check_acc(self, pred_sql, gt_sql):
        pass
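# Hedged usage sketch for SuperModel: word_emb, q_seq, history, and tables come
# from the surrounding project, and passing tables[0] to gen_sql is an
# assumption based on the per-table calls above, not a confirmed API.
model = SuperModel(word_emb, N_word=300, gpu=False)
current_sql = model.full_forward(q_seq, history, tables)  # nested dict, or None on timeout
if current_sql is not None:
    print(model.gen_sql(current_sql, tables[0]))  # render the dict as a SQL string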
import pandas as pd
from word_embedding import WordEmbedding
from solution.clean import CleanText

ct = CleanText()
data = pd.read_csv("data/Categorie_original.zip", sep=";").fillna("")
ct.clean_df_column(data, "Description", "Description_cleaned")
array_token = [line.split(" ") for line in data["Description_cleaned"].values]
print(len(array_token))

features_dimension = 300
min_count = 1
window = 5
hs = 0
negative = 10

we_sg = WordEmbedding(word_embedding_type="word2vec",
                      args=dict(sentences=array_token, sg=1, hs=hs,
                                negative=negative, min_count=min_count,
                                size=features_dimension, window=window, iter=15))
model_sg, training_time_sg = we_sg.train()
print("Model Skip-gram trained in %.2f minutes" % (training_time_sg / 60))

we_cbow = WordEmbedding(word_embedding_type="word2vec",
                        args=dict(sentences=array_token, sg=0, hs=hs,
                                  negative=negative, min_count=min_count,
                                  size=features_dimension, window=window, iter=15))
model_cbow, training_time_cbow = we_cbow.train()
print("Model CBOW trained in %.2f minutes" % (training_time_cbow / 60))

model_sg.save("data/full_model_sg")
model_cbow.save("data/full_model_cbow")
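# Hedged follow-up sketch: assuming we_sg.train() returns a gensim Word2Vec
# model (consistent with the gensim-style kwargs above), the saved files can be
# reloaded directly with gensim:
from gensim.models import Word2Vec

model = Word2Vec.load("data/full_model_sg")
print(model.wv.most_similar("some_token", topn=5))  # "some_token" is a placeholder vocabulary word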
class Seq2SQL(nn.Module):
    def __init__(self, word_emb, num_words, num_hidden=100, num_layers=2,
                 use_gpu=True):
        super(Seq2SQL, self).__init__()
        self.word_emb = word_emb
        self.num_words = num_words
        self.num_hidden = num_hidden
        self.num_layers = num_layers
        self.use_gpu = use_gpu
        self.max_col_num = 45
        self.max_tok_num = 200
        self.COND_OPS = ['EQL', 'GT', 'LT']
        self.SQL_TOK = ['<UNK>', '<BEG>', '<END>', 'WHERE', 'AND'] + self.COND_OPS

        # GloVe Word Embedding
        self.embed_layer = WordEmbedding(word_emb, num_words, self.SQL_TOK, use_gpu)
        # Aggregation Classifier
        self.agg_classifier = AggregationClassifier(num_words, num_hidden, num_layers)
        # SELECT Column(s)
        self.sel_classifier = SelectClassifier(num_words, num_hidden, num_layers,
                                               self.max_tok_num)
        # WHERE Clause
        self.whr_classifier = WhereClassifier(num_words, num_hidden, num_layers,
                                              self.max_col_num, self.max_tok_num,
                                              use_gpu)
        # run on GPU
        if use_gpu:
            self.cuda()

    def generate_g_s(self, q, col, query):
        # data format:
        #   <BEG> WHERE cond1_col cond1_op cond1
        #   AND cond2_col cond2_op cond2 AND ... <END>
        ret_seq = []
        for cur_q, cur_col, cur_query in zip(q, col, query):
            connect_col = [tok for col_tok in cur_col for tok in col_tok + [',']]
            all_toks = self.SQL_TOK + connect_col + [None] + cur_q + [None]
            cur_seq = [all_toks.index('<BEG>')]
            if 'WHERE' in cur_query:
                cur_where_query = cur_query[cur_query.index('WHERE'):]
                cur_seq = cur_seq + list(map(
                    lambda tok: all_toks.index(tok) if tok in all_toks else 0,
                    cur_where_query))
            cur_seq.append(all_toks.index('<END>'))
            ret_seq.append(cur_seq)
        return ret_seq

    def forward(self, q, col, col_num, classif_flag, g_s=None, reinforce=False):
        agg_classif, sel_classif, whr_classif = classif_flag
        agg_score, sel_score, whr_score = None, None, None
        x_emb_var, x_len = self.embed_layer.gen_x_batch(q, col)
        if agg_classif:
            agg_score = self.agg_classifier(x_emb_var, x_len)
        if sel_classif:
            col_inp_var, col_name_len, col_len = self.embed_layer.gen_col_batch(col)
            sel_score = self.sel_classifier(x_emb_var, x_len, col_inp_var,
                                            col_name_len, col_len, col_num)
        if whr_classif:
            whr_score = self.whr_classifier(x_emb_var, x_len, g_s,
                                            reinforce=reinforce)
        return (agg_score, sel_score, whr_score)

    def loss(self, score, ref_score, classif_flag, g_s):
        agg_classif, sel_classif, whr_classif = classif_flag
        agg_score, sel_score, whr_score = score
        loss = 0
        if agg_classif:
            agg_ref = torch.from_numpy(np.array([x[0] for x in ref_score]))
            agg_ref_var = Variable(agg_ref)
            if self.use_gpu:
                agg_ref_var = agg_ref_var.cuda()
            loss += nn.CrossEntropyLoss()(agg_score, agg_ref_var)
        if sel_classif:
            sel_ref = torch.from_numpy(np.array([x[1] for x in ref_score]))
            sel_ref_var = Variable(sel_ref)
            if self.use_gpu:
                sel_ref_var = sel_ref_var.cuda()
            loss += nn.CrossEntropyLoss()(sel_score, sel_ref_var)
        if whr_classif:
            g_s_len = len(g_s)
            for s, g_s_i in enumerate(g_s):
                whr_ref_var = Variable(torch.from_numpy(np.array(g_s_i[1:])))
                if self.use_gpu:
                    whr_ref_var = whr_ref_var.cuda()
                loss += (nn.CrossEntropyLoss()(whr_score[s, :len(g_s_i) - 1],
                                               whr_ref_var) / g_s_len)
        return loss

    def reinforce_backward(self, score, rewards):
        agg_score, sel_score, whr_score = score
        cur_reward = rewards[:]
        eof = self.SQL_TOK.index('<END>')
        for whr_score_t in whr_score[1]:
            reward_inp = torch.FloatTensor(cur_reward).unsqueeze(1)
            if self.use_gpu:
                reward_inp = reward_inp.cuda()
            whr_score_t.reinforce(reward_inp)
            for b, _ in enumerate(rewards):
                if whr_score_t[b].data.cpu().numpy()[0] == eof:
                    cur_reward[b] = 0
        torch.autograd.backward(whr_score[1], [None for _ in whr_score[1]])
        return

    def check_acc(self, classif_queries, g_s_queries, classif_flag):
        agg_classif, sel_classif, whr_classif = classif_flag
        tot_err = agg_err = sel_err = whr_err = 0.0
        whr_num_err = whr_col_err = whr_op_err = whr_val_err = 0.0
        for classif_qry, g_s_qry in zip(classif_queries, g_s_queries):
            agg_err_inc = 1 if agg_classif and classif_qry['agg'] != g_s_qry['agg'] else 0
            agg_err += agg_err_inc
            sel_err_inc = 1 if sel_classif and classif_qry['sel'] != g_s_qry['sel'] else 0
            sel_err += sel_err_inc
            flag = True  # initialized here so the total-error check below is always defined
            if whr_classif:
                whr_classifier = classif_qry['conds']
                whr_g_s = g_s_qry['conds']
                if len(whr_classifier) != len(whr_g_s):
                    flag = False
                    whr_num_err += 1
                elif set(x[0] for x in whr_classifier) != set(x[0] for x in whr_g_s):
                    flag = False
                    whr_col_err += 1
                if flag:
                    for whr_class_i in whr_classifier:
                        g_s_idx = tuple(x[0] for x in whr_g_s).index(whr_class_i[0])
                        if flag and whr_g_s[g_s_idx][1] != whr_class_i[1]:
                            flag = False
                            whr_op_err += 1
                            break
                if flag:
                    for whr_class_i in whr_classifier:
                        g_s_idx = tuple(x[0] for x in whr_g_s).index(whr_class_i[0])
                        if flag and unicode(whr_g_s[g_s_idx][2]).lower() != \
                                unicode(whr_class_i[2]).lower():
                            flag = False
                            whr_val_err += 1
                            break
                if not flag:
                    whr_err += 1
            if agg_err_inc > 0 or sel_err_inc > 0 or not flag:
                tot_err += 1
        return np.array((agg_err, sel_err, whr_err)), tot_err

    def gen_query(self, score, q, col, raw_q, raw_col, classif_flag,
                  reinforce=False, verbose=False):
        def merge_tokens(tok_list, raw_tok_str):
            tok_str = raw_tok_str.lower()
            special = {'-LRB-': '(', '-RRB-': ')', '-LSB-': '[', '-RSB-': ']',
                       '``': '"', '\'\'': '"', '--': u'\u2013'}
            ret = ''
            double_quote_pair_track = 0
            for raw_tok in tok_list:
                if not raw_tok:
                    continue
                tok = special.get(raw_tok, raw_tok)
                if tok == '"':
                    double_quote_pair_track = 1 - double_quote_pair_track
                    if double_quote_pair_track:
                        ret = ret + ' '
                if len(ret) == 0:
                    pass
                elif len(ret) > 0 and ret + ' ' + tok in tok_str:
                    ret = ret + ' '
                elif len(ret) > 0 and ret + tok in tok_str:
                    pass
                elif (tok[0] not in string.ascii_lowercase) and \
                        (tok[0] not in string.digits) and (tok[0] not in '$('):
                    pass
                elif (ret[-1] not in ['(', '/', u'\u2013', '#', '$', '&']) and \
                        (ret[-1] != '"' or not double_quote_pair_track):
                    ret = ret + ' '
                ret = ret + tok
            return ret.strip()

        agg_classif, sel_classif, whr_classif = classif_flag
        agg_score, sel_score, whr_score = score
        ret_queries = []
        batch_len = len(agg_score) if agg_classif else \
            len(sel_score) if sel_classif else \
            len(whr_score[0]) if reinforce else len(whr_score)
        for b in range(batch_len):
            cur_query = {}
            if agg_classif:
                cur_query['agg'] = np.argmax(agg_score[b].data.cpu().numpy())
            if sel_classif:
                cur_query['sel'] = np.argmax(sel_score[b].data.cpu().numpy())
            if whr_classif:
                cur_query['conds'] = []
                all_toks = self.SQL_TOK + \
                    [x for toks in col[b] for x in toks + [',']] + \
                    [''] + q[b] + ['']
                whr_toks = []
                if reinforce:
                    for choices in whr_score[1]:
                        if choices[b].data.cpu().numpy()[0] < len(all_toks):
                            whr_val = all_toks[choices[b].data.cpu().numpy()[0]]
                        else:
                            whr_val = '<UNK>'
                        if whr_val == '<END>':
                            break
                        whr_toks.append(whr_val)
                else:
                    for where_score in whr_score[b].data.cpu().numpy():
                        whr_tok = np.argmax(where_score)
                        whr_val = all_toks[whr_tok]
                        if whr_val == '<END>':
                            break
                        whr_toks.append(whr_val)
                if verbose:
                    print(whr_toks)
                if len(whr_toks) > 0:
                    whr_toks = whr_toks[1:]
                st = 0
                while st < len(whr_toks):
                    cur_cond = [None, None, None]
                    ed = len(whr_toks) if 'AND' not in whr_toks[st:] \
                        else whr_toks[st:].index('AND') + st
                    if 'EQL' in whr_toks[st:ed]:
                        op = whr_toks[st:ed].index('EQL') + st
                        cur_cond[1] = 0
                    elif 'GT' in whr_toks[st:ed]:
                        op = whr_toks[st:ed].index('GT') + st
                        cur_cond[1] = 1
                    elif 'LT' in whr_toks[st:ed]:
                        op = whr_toks[st:ed].index('LT') + st
                        cur_cond[1] = 2
                    else:
                        op = st
                        cur_cond[1] = 0
                    sel_col = whr_toks[st:op]
                    to_idx = [x.lower() for x in raw_col[b]]
                    classif_col = merge_tokens(sel_col, raw_q[b] + ' || ' +
                                               ' || '.join(raw_col[b]))
                    if classif_col in to_idx:
                        cur_cond[0] = to_idx.index(classif_col)
                    else:
                        cur_cond[0] = 0
                    cur_cond[2] = merge_tokens(whr_toks[op + 1:ed], raw_q[b])
                    cur_query['conds'].append(cur_cond)
                    st = ed + 1
            ret_queries.append(cur_query)
        return ret_queries
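# Hedged usage sketch for Seq2SQL: word_emb and the tokenized batches q, col,
# col_num, raw_q, raw_col come from the surrounding data pipeline and are not
# defined here.
model = Seq2SQL(word_emb, num_words=300, use_gpu=False)
classif_flag = (True, True, True)  # predict agg, sel, and where
score = model.forward(q, col, col_num, classif_flag)
queries = model.gen_query(score, q, col, raw_q, raw_col, classif_flag)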
elif args.train_component == "keyword":
    model = KeyWordPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                             gpu=GPU, use_hs=use_hs, bert=bert)
elif args.train_component == "col":
    model = ColPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                         gpu=GPU, use_hs=use_hs, bert=bert)
elif args.train_component == "op":
    model = OpPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                        gpu=GPU, use_hs=use_hs, bert=bert)
elif args.train_component == "agg":
    model = AggPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                         gpu=GPU, use_hs=use_hs, bert=bert)
elif args.train_component == "root_tem":
    model = RootTeminalPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                 gpu=GPU, use_hs=use_hs, bert=bert)
elif args.train_component == "des_asc":
    model = DesAscLimitPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                 gpu=GPU, use_hs=use_hs, bert=bert)
elif args.train_component == "having":
    model = HavingPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                            gpu=GPU, use_hs=use_hs, bert=bert)
elif args.train_component == "andor":
    model = AndOrPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                           gpu=GPU, use_hs=use_hs, bert=bert)
elif args.train_component == "from":
    model = FromPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                          gpu=GPU, use_hs=use_hs, bert=bert)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0)
if BERT:
    optimizer_bert = torch.optim.Adam(bert_model.parameters(), lr=bert_learning_rate)
else:
    optimizer_bert = None
print("finished building model")
print_flag = False
model.load_state_dict(torch.load(args.load_path))
embed_layer = WordEmbedding(word_emb, N_word, gpu=GPU, SQL_TOK=SQL_TOK,
                            use_bert=BERT, trainable=False)
acc = epoch_acc(model, BATCH_SIZE, args.train_component, embed_layer, dev_data,
                table_type=args.table_type)
print("finished: {}".format(time.time() - start_time))
def bolukbasi_debias_original(embedding, word_pairs, out_file, excludes=None,
                              mirrors=None, **kwargs):
    # type: (WordEmbedding, Iterable[Tuple[str, str]], Path, Iterable[str], Iterable[Tuple[str, str]], **Any) -> WordEmbedding
    """Debias a word embedding using Bolukbasi's original algorithm.

    Adapted from https://github.com/tolga-b/debiaswe/blob/master/debiaswe/debias.py#L19
    Commit 10277b23e187ee4bd2b6872b507163ef4198686b on 2018-04-02

    Parameters:
        embedding (WordEmbedding): The word embedding to debias.
        word_pairs (Iterable[Tuple[str, str]]): A list of word pairs that define the bias subspace.
        out_file (Path): The path to save the new embedding to.
        excludes (Iterable[str]): A collection of words to be excluded from the debiasing.
        mirrors (Iterable[Tuple[str, str]]): Specific words that should be equidistant.
        **kwargs: Other keyword arguments.

    Returns:
        WordEmbedding: The debiased word embedding.
    """
    if out_file.exists():
        return WordEmbedding.load_word2vec_file(out_file)
    # define the bias subspace from the recentered word pairs
    matrix = []
    for male_word, female_word in word_pairs:
        if male_word not in embedding or female_word not in embedding:
            continue
        matrix.extend(
            recenter(np.array([embedding[male_word], embedding[female_word]])))
    bias_subspace = define_bias_subspace(matrix, **kwargs)
    bias_subspace = _align_gender_direction(embedding, bias_subspace, word_pairs)
    bias_subspace = bias_subspace[np.newaxis, :]
    # debias by rejecting the subspace and reverting the excluded words
    if excludes is None:
        excludes = set()
    new_vectors = reject(embedding.vectors, bias_subspace)
    for word in excludes:
        if word in embedding:
            new_vectors[embedding.index(word)] = embedding[word]
    new_vectors = normalize(new_vectors)
    # FIXME does equalizing make sense in higher dimensions?
    # new_vectors = _bolukbasi_equalize(embedding, new_vectors, bias_subspace, mirrors)
    # create a word embedding from the new vectors
    new_embedding = WordEmbedding.from_vectors(embedding.words, new_vectors)
    new_embedding.source = out_file
    new_embedding.save()
    return new_embedding
def make_dataset(use_full_dataset=True):
    # Make txt dataset
    txt_dataset = _make_text_dataset(use_full_dataset=use_full_dataset)

    # Load full Stanford embedding from file
    stanford = pickle.load(open(constants.GLOVE_EMBEDDING_STANFORD_PATH, "rb"))

    # Create vocabulary, cut at top 20k words
    word_index = _make_capped_word_index(stanford, txt_dataset)

    # Reduce embedding matrix to include top 20k words
    embedding_vectors = _make_embeddings(stanford, word_index)

    # Create ID dataset, with <pad>'s and <unk>'s
    id_dataset = make_id_dataset(txt_dataset, word_index)

    print("Creating and saving indices...")
    # Make ordering indices for shuffling data
    N = len(id_dataset["train_tweets"])
    index = np.arange(N)
    np.random.seed(constants.SEED)
    np.random.shuffle(index)

    # Divide indices into train and test indices
    divider = int(constants.SPLIT_RATIO * N)
    train_index = index[:divider]
    test_index = index[divider:]
    data_index = {"train_index": train_index, "test_index": test_index}

    # Save data index
    index_path = constants.DATA_INDEX_SMALL_PATH
    if use_full_dataset:
        index_path = constants.DATA_INDEX_FULL_PATH
    pickle.dump(data_index, open(index_path, "wb"))

    # Save to pickle
    print("Saving word embeddings...")
    word_embedding_20k = WordEmbedding(embedding_vectors, word_index)
    pickle.dump(word_embedding_20k,
                open(constants.STANFORD_20K_EMBEDDING_PATH, "wb"))

    print("Saving txt dataset...")
    txt_dataset_path = constants.TXT_DATASET_SMALL_PATH
    if use_full_dataset:
        txt_dataset_path = constants.TXT_DATASET_FULL_PATH
    pickle.dump(txt_dataset, open(txt_dataset_path, "wb"))

    print("Saving id dataset...")
    id_dataset_path = constants.ID_DATASET_SMALL_PATH
    if use_full_dataset:
        id_dataset_path = constants.ID_DATASET_FULL_PATH
    pickle.dump(id_dataset, open(id_dataset_path, "wb"))

    # Plot tweet length distribution
    tweets_lengths = np.array([len(tweet) for tweet in txt_dataset["train_tweets"]])
    plt.hist(tweets_lengths, bins=50, edgecolor="black")
    plt.xlabel("Tweet length")
    plt.ylabel("Frequency")
    plt.savefig(constants.PLOTS_DIR + "tweet_lengths.eps", format="eps",
                dpi=1000, bbox_inches="tight")

    # Print tweet fraction with length <= 40 words
    frac_max_40_words = len(tweets_lengths[tweets_lengths <= 40]) / len(tweets_lengths)
    print("Fraction of tweets with length <= 40 words:", frac_max_40_words)
    def __len__(self):
        return len(self.words)

    def dim(self):
        return self.embedding.dim

    def get_chunk(self):
        chunk_words = []
        got_good_words = False
        while not got_good_words:
            sta_ind = np.random.randint(0, len(self) - self.chunk_size - 1)
            end_ind = sta_ind + self.chunk_size
            chunk_words = self.words[sta_ind:end_ind]
            # every word in the chunk must be present in the embedding
            got_good_words = all(word in self.embedding for word in chunk_words)
        vec_chunk = np.stack([self.embedding[word] for word in chunk_words])
        return torch.from_numpy(vec_chunk)

    def get_chunks(self, n_chunks):
        return torch.stack([self.get_chunk() for _ in range(n_chunks)])


if __name__ == '__main__':
    embedding_fn = '/Users/bkeating/nltk_data/embeddings/glove/glove.6B.100d.txt'
    embedding = WordEmbedding(embedding_fn)
    dataset = WordDataset('data/keywell_corpus.txt', embedding)
    chunk = dataset.get_chunk()
    print(chunk.size())
    chunks = dataset.get_chunks(20)
    print(chunks.size())
class SQLNet(nn.Module): def __init__(self, word_emb, N_word, N_h=100, N_depth=2, gpu=False, use_ca=True, trainable_emb=False): super(SQLNet, self).__init__() self.use_ca = use_ca self.trainable_emb = trainable_emb self.gpu = gpu self.N_h = N_h self.N_depth = N_depth self.max_col_num = 45 self.max_tok_num = 200 self.SQL_TOK = [ '<UNK>', '<END>', 'WHERE', 'AND', 'OR', '==', '>', '<', '!=', '<BEG>' ] self.COND_OPS = ['>', '<', '==', '!='] # 词向量,可选择自己训练或者使用训练好的词向量,这里选用加载好的词向量 self.embed_layer = WordEmbedding(word_emb, N_word, gpu, self.SQL_TOK, our_model=True, trainable=trainable_emb) # 预测列数目 self.sel_num = SelNumPredictor(N_word, N_h, N_depth, use_ca=use_ca) # 预测那个列被选中了 self.sel_pred = SelPredictor(N_word, N_h, N_depth, self.max_tok_num, use_ca=use_ca) # 预测相应选定列的聚合函数 self.agg_pred = AggPredictor(N_word, N_h, N_depth, use_ca=use_ca) # 预测条件数、条件列、条件操作和条件值 self.cond_pred = SQLNetCondPredictor(N_word, N_h, N_depth, self.max_col_num, self.max_tok_num, use_ca, gpu) # 预测条件关系,如“and”、“or” self.where_rela_pred = WhereRelationPredictor(N_word, N_h, N_depth, use_ca=use_ca) self.CE = nn.CrossEntropyLoss() #交叉熵损失函数 self.softmax = nn.Softmax(dim=-1) self.log_softmax = nn.LogSoftmax() self.bce_logit = nn.BCEWithLogitsLoss() if gpu: self.cuda() # q:问题,gt_cond_seq:三元组 目的:要选择那一列 def generate_gt_where_seq_test(self, q, gt_cond_seq): ret_seq = [] for cur_q, ans in zip(q, gt_cond_seq): temp_q = u"".join(cur_q) cur_q = [u'<BEG>'] + cur_q + [u'<END>'] # 在每个问题前加<BEG>和结尾加<END> record = [] #如果条件值在问题中,标记(TRUE,条件值) record_cond = [] for cond in ans: if cond[2] not in temp_q: record.append((False, cond[2])) else: record.append((True, cond[2])) for idx, item in enumerate(record): temp_ret_seq = [] if item[0]: temp_ret_seq.append(0) temp_ret_seq.extend( list( range( temp_q.index(item[1]) + 1, temp_q.index(item[1]) + len(item[1]) + 1))) #获取条件值的索引 temp_ret_seq.append(len(cur_q) - 1) else: temp_ret_seq.append([0, len(cur_q) - 1]) record_cond.append(temp_ret_seq) ret_seq.append(record_cond) return ret_seq #q:问题,col:表头名字,col_num:有几个表头列,gt_where:conds中条件值不出现在问题中,gt_conds:conds,gt_sel:选择那列,gt_sel_num:选择几列 def forward(self, q, col, col_num, gt_where=None, gt_cond=None, reinforce=False, gt_sel=None, gt_sel_num=None): B = len(q) #batch_size的大小 sel_num_score = None agg_score = None sel_score = None cond_score = None #预测聚合函数 if self.trainable_emb: x_emb_var, x_len = self.agg_embed_layer.gen_x_batch(q, col) col_inp_var, col_name_len, col_len = self.agg_embed_layer.gen_col_batch( col) max_x_len = max(x_len) agg_score = self.agg_pred(x_emb_var, x_len, col_inp_var, col_name_len, col_len, col_num, gt_sel=gt_sel) x_emb_var, x_len = self.sel_embed_layer.gen_x_batch(q, col) col_inp_var, col_name_len, col_len = self.sel_embed_layer.gen_col_batch( col) max_x_len = max(x_len) sel_score = self.sel_pred(x_emb_var, x_len, col_inp_var, col_name_len, col_len, col_num) x_emb_var, x_len = self.cond_embed_layer.gen_x_batch(q, col) col_inp_var, col_name_len, col_len = self.cond_embed_layer.gen_col_batch( col) max_x_len = max(x_len) cond_score = self.cond_pred(x_emb_var, x_len, col_inp_var, col_name_len, col_len, col_num, gt_where, gt_cond, reinforce=reinforce) where_rela_score = None else: x_emb_var, x_len = self.embed_layer.gen_x_batch( q, col ) #x_len:batch中每个问题的长度,[x_emb_var:batch_size,max_seq_len,word_embedding_size] col_inp_var, col_name_len, col_len = self.embed_layer.gen_col_batch( col) #列名向量化,长度,几个列 sel_num_score = self.sel_num( x_emb_var, x_len, col_inp_var, col_name_len, col_len, col_num) #[16,4]对问题的编码经过lstm,linear,softmax之后乘以编码 # x_emb_var: 
embedding of each question # x_len: length of each question # col_inp_var: embedding of each header # col_name_len: length of each header # col_len: number of headers in each table, array type # col_num: number of headers in each table, list type if gt_sel_num: pr_sel_num = gt_sel_num else: pr_sel_num = np.argmax(sel_num_score.data.cpu().numpy(), axis=1) sel_score = self.sel_pred(x_emb_var, x_len, col_inp_var, col_name_len, col_len, col_num) #【16,19】 if gt_sel: pr_sel = gt_sel else: num = np.argmax(sel_num_score.data.cpu().numpy(), axis=1) sel = sel_score.data.cpu().numpy() pr_sel = [ list(np.argsort(-sel[b])[:num[b]]) for b in range(len(num)) ] agg_score = self.agg_pred(x_emb_var, x_len, col_inp_var, col_name_len, col_len, col_num, gt_sel=pr_sel, gt_sel_num=pr_sel_num) #【16,4,6】 where_rela_score = self.where_rela_pred(x_emb_var, x_len, col_inp_var, col_name_len, col_len, col_num) #【16,3】 cond_score = self.cond_pred(x_emb_var, x_len, col_inp_var, col_name_len, col_len, col_num, gt_where, gt_cond, reinforce=reinforce) #4=>[16,5] return (sel_num_score, sel_score, agg_score, cond_score, where_rela_score) def loss(self, score, truth_num, gt_where): sel_num_score, sel_score, agg_score, cond_score, where_rela_score = score B = len(truth_num) loss = 0 # Evaluate select number sel_num_truth = list(map(lambda x: x[0], truth_num)) #聚合函数个数 sel_num_truth = torch.from_numpy( np.array(sel_num_truth)).long() #.astype(float)) if self.gpu: sel_num_truth = Variable(sel_num_truth.cuda()) else: sel_num_truth = Variable(sel_num_truth) #选择几个列的损失 loss += self.CE(sel_num_score, sel_num_truth) # Evaluate select column选择哪个列的损失 T = len(sel_score[0]) truth_prob = np.zeros((B, T), dtype=np.float32) for b in range(B): truth_prob[b][list(truth_num[b][1])] = 1 data = torch.from_numpy(truth_prob) if self.gpu: sel_col_truth_var = Variable(data.cuda()) else: sel_col_truth_var = Variable(data) sigm = nn.Sigmoid() sel_col_prob = sigm(sel_score) bce_loss = -torch.mean( 3 * (sel_col_truth_var * torch.log(sel_col_prob + 1e-10)) + (1 - sel_col_truth_var) * torch.log(1 - sel_col_prob + 1e-10) ) #这儿采用bceloss:-w*[y*log(x)+(1-y)*log(1-x)] loss += bce_loss # Evaluate select aggregation选择聚合函数的损失交叉熵 for b in range(len(truth_num)): data = torch.from_numpy(np.array(truth_num[b][2])) #真实的聚合函数 if self.gpu: sel_agg_truth_var = Variable(data.cuda()) else: sel_agg_truth_var = Variable(data.long()) sel_agg_pred = agg_score[b, :len(truth_num[b][1])] #聚合函数共六种 loss += (self.CE(sel_agg_pred, sel_agg_truth_var)) / len(truth_num) cond_num_score, cond_col_score, cond_op_score, cond_str_score = cond_score # Evaluate the number of conditions预测多少个conds的损失交叉熵 cond_num_truth = list(map(lambda x: x[3], truth_num)) data = torch.from_numpy(np.array(cond_num_truth).astype(float)).long() if self.gpu: try: cond_num_truth_var = Variable(data.cuda()) except: print("cond_num_truth_var error") print(data) exit(0) else: cond_num_truth_var = Variable(data) loss += self.CE(cond_num_score, cond_num_truth_var) # Evaluate the columns of conditions评估条件列 T = len(cond_col_score[0]) truth_prob = np.zeros((B, T), dtype=np.float32) for b in range(B): if len(truth_num[b][4]) > 0: truth_prob[b][list(truth_num[b][4])] = 1 #条件列 data = torch.from_numpy(truth_prob) if self.gpu: cond_col_truth_var = Variable(data.cuda()) else: cond_col_truth_var = Variable(data) sigm = nn.Sigmoid() cond_col_prob = sigm(cond_col_score) bce_loss = -torch.mean( 3 * (cond_col_truth_var * torch.log(cond_col_prob + 1e-10)) + (1 - cond_col_truth_var) * torch.log(1 - cond_col_prob + 1e-10)) loss += bce_loss # 
        # Evaluate the condition operators (cross entropy)
        for b in range(len(truth_num)):
            if len(truth_num[b][5]) == 0:  # no condition operators for this example
                continue
            data = torch.from_numpy(np.array(truth_num[b][5])).long()
            if self.gpu:
                cond_op_truth_var = Variable(data.cuda())
            else:
                cond_op_truth_var = Variable(data)
            cond_op_pred = cond_op_score[b, :len(truth_num[b][5])]
            loss += self.CE(cond_op_pred, cond_op_truth_var) / len(truth_num)

        # Evaluate the condition value strings (cross entropy over tokens)
        for b in range(len(gt_where)):
            for idx in range(len(gt_where[b])):
                cond_str_truth = gt_where[b][idx]
                if len(cond_str_truth) == 1:
                    continue
                data = torch.from_numpy(np.array(cond_str_truth[1:])).long()
                if self.gpu:
                    cond_str_truth_var = Variable(data.cuda())
                else:
                    cond_str_truth_var = Variable(data)
                str_end = len(cond_str_truth) - 1
                cond_str_pred = cond_str_score[b, idx, :str_end]
                loss += (self.CE(cond_str_pred, cond_str_truth_var)
                         / (len(gt_where) * len(gt_where[b])))

        # Evaluate the condition relationship (and / or)
        where_rela_truth = list(map(lambda x: x[6], truth_num))
        data = torch.from_numpy(np.array(where_rela_truth)).long()
        if self.gpu:
            try:
                where_rela_truth = Variable(data.cuda())
            except:
                print("where_rela_truth error")
                print(data)
                exit(0)
        else:
            where_rela_truth = Variable(data)
        loss += self.CE(where_rela_score, where_rela_truth)

        return loss

    def check_acc(self, vis_info, pred_queries, gt_queries):

        def gen_cond_str(conds, header):
            if len(conds) == 0:
                return 'None'
            cond_str = []
            for cond in conds:
                cond_str.append(header[cond[0]] + ' ' + self.COND_OPS[cond[1]]
                                + ' ' + str(cond[2]).lower())
            return 'WHERE ' + ' AND '.join(cond_str)

        tot_err = sel_num_err = agg_err = sel_err = 0.0
        cond_num_err = cond_col_err = cond_op_err = cond_val_err = cond_rela_err = 0.0
        for b, (pred_qry, gt_qry) in enumerate(zip(pred_queries, gt_queries)):
            good = True
            sel_pred, agg_pred, where_rela_pred = \
                pred_qry['sel'], pred_qry['agg'], pred_qry['cond_conn_op']
            sel_gt, agg_gt, where_rela_gt = \
                gt_qry['sel'], gt_qry['agg'], gt_qry['cond_conn_op']

            if where_rela_gt != where_rela_pred:
                good = False
                cond_rela_err += 1

            if len(sel_pred) != len(sel_gt):
                good = False
                sel_num_err += 1

            pred_sel_dict = {k: v for k, v in zip(list(sel_pred), list(agg_pred))}
            gt_sel_dict = {k: v for k, v in zip(sel_gt, agg_gt)}
            if set(sel_pred) != set(sel_gt):
                good = False
                sel_err += 1
            agg_pred = [pred_sel_dict[x] for x in sorted(pred_sel_dict.keys())]
            agg_gt = [gt_sel_dict[x] for x in sorted(gt_sel_dict.keys())]
            if agg_pred != agg_gt:
                good = False
                agg_err += 1

            cond_pred = pred_qry['conds']
            cond_gt = gt_qry['conds']
            if len(cond_pred) != len(cond_gt):
                good = False
                cond_num_err += 1
            else:
                cond_op_pred, cond_op_gt = {}, {}
                cond_val_pred, cond_val_gt = {}, {}
                for p, g in zip(cond_pred, cond_gt):
                    cond_op_pred[p[0]] = p[1]
                    cond_val_pred[p[0]] = p[2]
                    cond_op_gt[g[0]] = g[1]
                    cond_val_gt[g[0]] = g[2]

                if set(cond_op_pred.keys()) != set(cond_op_gt.keys()):
                    cond_col_err += 1
                    good = False

                where_op_pred = [cond_op_pred[x] for x in sorted(cond_op_pred.keys())]
                where_op_gt = [cond_op_gt[x] for x in sorted(cond_op_gt.keys())]
                if where_op_pred != where_op_gt:
                    cond_op_err += 1
                    good = False

                where_val_pred = [cond_val_pred[x] for x in sorted(cond_val_pred.keys())]
                where_val_gt = [cond_val_gt[x] for x in sorted(cond_val_gt.keys())]
                if where_val_pred != where_val_gt:
                    cond_val_err += 1
                    good = False

            if not good:
                tot_err += 1

        return np.array((sel_num_err, sel_err, agg_err,
                         cond_num_err, cond_col_err, cond_op_err,
                         cond_val_err, cond_rela_err)), tot_err
    def gen_query(self, score, q, col, raw_q, reinforce=False, verbose=False):
        """
        :param score: (sel_num_score, sel_score, agg_score, cond_score, where_rela_score)
        :param q: token-questions
        :param col: token-headers
        :param raw_q: original question sequence
        :return: a list of predicted query dicts, one per example in the batch
        """

        def merge_tokens(tok_list, raw_tok_str):
            tok_str = raw_tok_str  # .lower()
            special = {
                '-LRB-': '(',
                '-RRB-': ')',
                '-LSB-': '[',
                '-RSB-': ']',
                '``': '"',
                '\'\'': '"',
                '--': u'\u2013'
            }
            ret = ''
            double_quote_appear = 0
            for raw_tok in tok_list:
                if not raw_tok:
                    continue
                tok = special.get(raw_tok, raw_tok)
                if tok == '"':
                    double_quote_appear = 1 - double_quote_appear
                # Insert a space only where the raw question string has one,
                # so detokenized values match the original text
                if len(ret) == 0:
                    pass
                elif len(ret) > 0 and ret + ' ' + tok in tok_str:
                    ret = ret + ' '
                elif len(ret) > 0 and ret + tok in tok_str:
                    pass
                elif tok == '"':
                    if double_quote_appear:
                        ret = ret + ' '
                elif (ret[-1] not in ['(', '/', u'\u2013', '#', '$', '&']) \
                        and (ret[-1] != '"' or not double_quote_appear):
                    ret = ret + ' '
                ret = ret + tok
            return ret.strip()

        sel_num_score, sel_score, agg_score, cond_score, where_rela_score = score
        # shapes e.g. [64, 4, 6], [64, 14], ..., [64, 4]
        sel_num_score = sel_num_score.data.cpu().numpy()
        sel_score = sel_score.data.cpu().numpy()
        agg_score = agg_score.data.cpu().numpy()
        where_rela_score = where_rela_score.data.cpu().numpy()
        ret_queries = []
        B = len(agg_score)
        cond_num_score, cond_col_score, cond_op_score, cond_str_score = \
            [x.data.cpu().numpy() for x in cond_score]
        for b in range(B):
            cur_query = {}
            cur_query['sel'] = []
            cur_query['agg'] = []
            sel_num = np.argmax(sel_num_score[b])
            max_col_idxes = np.argsort(-sel_score[b])[:sel_num]
            # find the most probable column indexes
            max_agg_idxes = np.argsort(-agg_score[b])[:sel_num]
            cur_query['sel'].extend([int(i) for i in max_col_idxes])
            cur_query['agg'].extend([i[0] for i in max_agg_idxes])
            cur_query['cond_conn_op'] = np.argmax(where_rela_score[b])
            cur_query['conds'] = []
            cond_num = np.argmax(cond_num_score[b])
            all_toks = ['<BEG>'] + q[b] + ['<END>']
            max_idxes = np.argsort(-cond_col_score[b])[:cond_num]
            for idx in range(cond_num):
                cur_cond = []
                cur_cond.append(max_idxes[idx])  # where-col
                cur_cond.append(np.argmax(cond_op_score[b][idx]))  # where-op
                # decode the condition value token by token until <END>
                cur_cond_str_toks = []
                for str_score in cond_str_score[b][idx]:
                    str_tok = np.argmax(str_score[:len(all_toks)])
                    str_val = all_toks[str_tok]
                    if str_val == '<END>':
                        break
                    cur_cond_str_toks.append(str_val)
                cur_cond.append(merge_tokens(cur_cond_str_toks, raw_q[b]))
                cur_query['conds'].append(cur_cond)
            ret_queries.append(cur_query)
        return ret_queries
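# -- Hedged sketch, not part of the model above -----------------------------
# The hand-rolled weighted BCE in loss() is, up to the 1e-10 smoothing, the
# same as PyTorch's BCEWithLogitsLoss with pos_weight=3: the positive term of
# -[y*log(p) + (1-y)*log(1-p)] is up-weighted by 3 to counter the sparsity of
# true columns. The 16x19 shape echoes the sel_score shape comment above; the
# random inputs are invented for illustration.
import torch
import torch.nn as nn

def weighted_bce(scores, truth, pos_weight=3.0, eps=1e-10):
    prob = torch.sigmoid(scores)  # scores are raw logits, shape [B, T]
    return -torch.mean(pos_weight * truth * torch.log(prob + eps)
                       + (1 - truth) * torch.log(1 - prob + eps))

torch.manual_seed(0)
scores = torch.randn(16, 19)                # 16 questions, 19 candidate columns
truth = (torch.rand(16, 19) < 0.2).float()  # sparse 0/1 column mask
builtin = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(3.0))
print(weighted_bce(scores, truth).item())   # agrees with the built-in loss
print(builtin(scores, truth).item())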
    eva_tfidf = Evaluation(tweets_tfidf)
    conf_matrix = eva_tfidf.build_confusion_matrix(tweets_tfidf)
    print("Confusion matrix:")
    print(conf_matrix)
    print("Accuracy using TF-IDF weighting algorithm: {}".format(
        eva_tfidf.accuracy()))
    print("Average Precision using TF-IDF weighting algorithm: {}".format(
        eva_tfidf.average_precision()))
    print("Average Recall using TF-IDF weighting algorithm: {}".format(
        eva_tfidf.average_recall()))


# read the dataset
# preprocess the data: strip punctuation
data = Dataset()
data.load_dataset()
data.cleanse_dataset()
data.build_dictionaries()
tweets = data.get_dataset()
tweets_rake = tweets.copy()
tweets_tfidf = tweets.copy()

emb = WordEmbedding()
emb_vec = emb.load_embedding(emb_type='fasttext-id')
asp = Aspects()

run_experiment_with_rake()
run_experiment_with_tfidf(tweets_tfidf)
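# -- Hedged sketch -----------------------------------------------------------
# The Evaluation class used above is not shown here; as a rough sketch of
# what accuracy / average_precision / average_recall presumably compute, this
# is a macro-averaged version over a plain confusion matrix (rows = gold
# labels, columns = predictions). Purely illustrative, not the project's code.
import numpy as np

def macro_metrics(conf):
    conf = np.asarray(conf, dtype=float)
    accuracy = np.trace(conf) / conf.sum()                               # correct / total
    precision = np.mean(np.diag(conf) / np.maximum(conf.sum(axis=0), 1e-10))
    recall = np.mean(np.diag(conf) / np.maximum(conf.sum(axis=1), 1e-10))
    return accuracy, precision, recall

print(macro_metrics([[50, 5], [10, 35]]))  # (0.85, ~0.854, ~0.843)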
class LDA2Vec:

    def __init__(self, num_docs, vocab_size, num_topics, embedding_size,
                 freqs, batch_size, save_graph, num_sampled=40):
        self.num_docs = num_docs
        self.vocab_size = vocab_size
        self.num_topics = num_topics
        self.embedding_size = embedding_size
        self.freqs = freqs
        self.batch_size = batch_size
        self.save_graph = save_graph
        self.num_sampled = num_sampled
        self.lmbda = 200.0
        self.learning_rate = 0.001
        self.moving_avgs = tf.train.ExponentialMovingAverage(0.9)
        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.sesh = tf.Session(config=self.config)
        self.computed_norm = False
        self.logdir = "_".join(("lda2vec", datetime.now().strftime('%y%m%d_%H%M')))
        self.w_embed = WordEmbedding(self.embedding_size,
                                     self.vocab_size,
                                     self.num_sampled,
                                     freqs=self.freqs)
        self.mixture = EmbeddingMixture(self.num_docs, self.num_topics,
                                        self.embedding_size)
        handles = self.retrieve_variables()
        (self.x, self.y, self.docs, self.step, self.switch_loss,
         self.word_context, self.doc_context, self.loss_word2vec,
         self.fraction, self.loss_lda, self.loss, self.loss_avgs_op,
         self.optimizer, self.merged) = handles

    def train(self, pivot_ids, target_ids, doc_ids, num_epochs, idx_to_word,
              switch_loss_epoch=5, save_every=1, report_every=1,
              print_topics_every=5):
        data_size = len(pivot_ids)
        temp_fraction = self.batch_size * 1.0 / data_size
        self.sesh.run(tf.assign(self.fraction, temp_fraction))
        # round up so the final partial batch counts as one iteration
        iters_per_epoch = int(np.ceil(data_size / self.batch_size))
        switch_loss_step = iters_per_epoch * switch_loss_epoch
        self.sesh.run(tf.assign(self.switch_loss, switch_loss_step))
        if self.save_graph:
            saver = tf.train.Saver()
            writer = tf.summary.FileWriter(self.logdir + '/',
                                           graph=self.sesh.graph)

        for epoch in range(num_epochs):
            print('\nEPOCH:', epoch + 1)
            for pivot, target, doc in chunks(self.batch_size, pivot_ids,
                                             target_ids, doc_ids):
                feed_dict = {self.x: pivot, self.y: target, self.docs: doc}
                fetches = [self.merged, self.optimizer, self.loss,
                           self.loss_word2vec, self.loss_lda, self.step]
                summary, _, l, lw2v, llda, step = self.sesh.run(
                    fetches, feed_dict=feed_dict)

            if (epoch + 1) % report_every == 0:
                print('Loss: ', l, 'Word2Vec Loss: ', lw2v, 'LDA loss: ', llda)

            if (epoch + 1) % save_every == 0 and self.save_graph:
                writer.add_summary(summary, step)
                writer.flush()
                writer.close()
                save_path = saver.save(self.sesh, self.logdir + '/model.ckpt')
                writer = tf.summary.FileWriter(self.logdir + '/',
                                               graph=self.sesh.graph)

            if epoch > 0 and (epoch + 1) % print_topics_every == 0:
                idxs = np.arange(self.num_topics)
                words, sims = self.get_k_closest(idxs, idx_to_word=idx_to_word,
                                                 k=10)

        if self.save_graph and (epoch + 1) % save_every != 0:
            writer.add_summary(summary, step)
            writer.flush()
            writer.close()
            save_path = saver.save(self.sesh, self.logdir + '/model.ckpt')

    def get_k_closest(self, idxs, in_type="topic", vs_type="word", k=10,
                      idx_to_word=None):
        if not self.computed_norm:
            # cache L2-normalized copies of each embedding matrix so the
            # matmul below yields cosine similarities
            self.normed_embed_dict = {}
            norm = tf.sqrt(tf.reduce_sum(self.mixture.topic_embedding ** 2, 1,
                                         keep_dims=True))
            self.normed_embed_dict['topic'] = self.mixture.topic_embedding / norm
            norm = tf.sqrt(tf.reduce_sum(self.w_embed.embedding ** 2, 1,
                                         keep_dims=True))
            self.normed_embed_dict['word'] = self.w_embed.embedding / norm
            norm = tf.sqrt(tf.reduce_sum(self.mixture.doc_embedding ** 2, 1,
                                         keep_dims=True))
            self.normed_embed_dict['doc'] = self.mixture.doc_embedding / norm
            self.idxs_in = tf.placeholder(tf.int32, shape=[None], name='idxs')
            self.computed_norm = True
        self.batch_array = tf.nn.embedding_lookup(
            self.normed_embed_dict[in_type], self.idxs_in)
        self.cosine_similarity = tf.matmul(
            self.batch_array,
            tf.transpose(self.normed_embed_dict[vs_type], [1, 0]))
        feed_dict = {self.idxs_in: idxs}
        sim, sim_idxs = self.sesh.run(tf.nn.top_k(self.cosine_similarity, k=k),
                                      feed_dict=feed_dict)
        if idx_to_word:
            print('---------Closest {} words to given indexes----------'.format(k))
            for i, idx in enumerate(idxs):
                in_word = 'Topic ' + str(idx)
                vs_word_list = []
                for vs_i in range(sim_idxs[i].shape[0]):
                    vs_idx = sim_idxs[i][vs_i]
                    vs_word_list.append(idx_to_word[vs_idx])
                print(in_word, ':', ', '.join(vs_word_list))
        return (sim, sim_idxs)

    def retrieve_variables(self):
        x = tf.placeholder(tf.int32, shape=[None], name='x_pivot_idxs')
        y = tf.placeholder(tf.int64, shape=[None], name='y_target_idxs')
        docs = tf.placeholder(tf.int32, shape=[None], name='doc_ids')
        step = tf.Variable(0, trainable=False, name='global_step')
        switch_loss = tf.Variable(0, trainable=False)
        word_context = tf.nn.embedding_lookup(self.w_embed.embedding, x,
                                              name='word_embed_lookup')
        doc_context = self.mixture.get_context(doc_ids=docs)
        contexts_to_add = [word_context, doc_context]
        context = tf.add_n(contexts_to_add, name='context_vector')

        with tf.name_scope('nce_loss'):
            loss_word2vec = self.w_embed.compute_loss(context, y)
            tf.summary.scalar('nce_loss', loss_word2vec)

        with tf.name_scope('lda_loss'):
            fraction = tf.Variable(1, trainable=False, dtype=tf.float32,
                                   name='fraction')
            loss_lda = self.lmbda * fraction * self.prior()
            tf.summary.scalar('lda_loss', loss_lda)

        # train on the word2vec loss alone for the first switch_loss steps,
        # then on the combined word2vec + LDA loss
        loss = tf.cond(step < switch_loss,
                       lambda: loss_word2vec,
                       lambda: loss_word2vec + loss_lda)
        loss_avgs_op = self.moving_avgs.apply([loss_lda, loss_word2vec, loss])
        with tf.control_dependencies([loss_avgs_op]):
            optimizer = tf.contrib.layers.optimize_loss(
                loss,
                tf.train.get_global_step(),
                self.learning_rate,
                'Adam',
                name='Optimizer')
        self.sesh.run(
            tf.global_variables_initializer(),
            options=tf.RunOptions(report_tensor_allocations_upon_oom=True))
        merged = tf.summary.merge_all()
        return [x, y, docs, step, switch_loss, word_context, doc_context,
                loss_word2vec, fraction, loss_lda, loss, loss_avgs_op,
                optimizer, merged]

    def prior(self):
        # Dirichlet prior over the per-document topic proportions
        n_topics = self.mixture.doc_embedding.get_shape()[1].value
        alpha = 1.0 / n_topics
        log_proportions = tf.nn.log_softmax(self.mixture.doc_embedding)
        return tf.reduce_sum((alpha - 1.0) * log_proportions)
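# -- Hedged sketch -----------------------------------------------------------
# get_k_closest() above works by L2-normalizing both embedding matrices so
# that a matrix product becomes cosine similarity, then taking the top-k.
# A minimal NumPy version of the same idea, detached from the TF1 graph
# (all shapes here are invented):
import numpy as np

def k_closest(queries, candidates, k=10):
    q = queries / np.linalg.norm(queries, axis=1, keepdims=True)
    c = candidates / np.linalg.norm(candidates, axis=1, keepdims=True)
    sims = q @ c.T                          # cosine similarity matrix
    top = np.argsort(-sims, axis=1)[:, :k]  # indexes of the k most similar
    return np.take_along_axis(sims, top, axis=1), top

rng = np.random.default_rng(0)
topics = rng.normal(size=(20, 256))   # e.g. 20 topic vectors
words = rng.normal(size=(5000, 256))  # e.g. 5000 word vectors
sim, sim_idxs = k_closest(topics, words, k=10)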
def train_feedback(nlq, db_name, correct_query, toy, word_emb):
    """
    Arguments:
        nlq: English question (tokenization is done here) - from Flask (user)
        db_name: name of the database the query targets - from Flask (user)
        correct_query: the ground-truth query supplied by the user(s) - from Flask
        toy: use a small sample of word embeddings to debug faster
        word_emb: pretrained word embeddings shared across all components
    """
    ITER = 21
    SAVED_MODELS_FOLDER = "saved_models"
    OUTPUT_PATH = "output_inference.txt"
    HISTORY_TYPE = "full"
    GPU_ENABLE = False
    TRAIN_EMB = False
    TABLE_TYPE = "std"
    DATA_ROOT = "generated_data"

    use_hs = True
    if HISTORY_TYPE == "no":
        HISTORY_TYPE = "full"
        use_hs = False

    # Model hyperparameters
    N_word = 300  # word embedding dimension
    B_word = 42   # 42B-token GloVe pretrained embeddings
    N_h = 300     # hidden size
    N_depth = 2

    if toy:
        USE_SMALL = True
        GPU = GPU_ENABLE
        BATCH_SIZE = 20
    else:
        USE_SMALL = False
        GPU = GPU_ENABLE
        BATCH_SIZE = 64
    learning_rate = 1e-4

    # Build the dataset entry for the user-corrected query
    table_data_path = "./data/spider/tables.json"
    table_dict = get_table_dict(table_data_path)
    train_data_path = "./data/spider/train_spider.json"
    train_data = json.load(open(train_data_path))

    sql = correct_query  # e.g. "SELECT name , country , age FROM singer ORDER BY age DESC"
    db_id = db_name      # e.g. "concert_singer"
    table_file = table_data_path

    schemas, db_names, tables = get_schemas_from_json(table_file)
    schema = schemas[db_id]
    table = tables[db_id]
    schema = Schema(schema, table)
    sql_label = get_sql(schema, sql)

    correct_query_data = {
        "multi_sql_dataset": [],
        "keyword_dataset": [],
        "col_dataset": [],
        "op_dataset": [],
        "agg_dataset": [],
        "root_tem_dataset": [],
        "des_asc_dataset": [],
        "having_dataset": [],
        "andor_dataset": []
    }
    parser_item_with_long_history(
        tokenize(nlq),        # item["question_toks"]
        sql_label,            # item["sql"]
        table_dict[db_name],  # table_dict[item["db_id"]]
        [],
        correct_query_data)

    for train_component in TRAIN_COMPONENTS:
        print("\nTRAIN COMPONENT: {}".format(train_component))
        # Sanity check: the component to be trained must be a known component
        if train_component not in TRAIN_COMPONENTS:
            print("Invalid train component")
            exit(1)

        # Read in the data
        train_data = load_train_dev_dataset(train_component, "train",
                                            HISTORY_TYPE, DATA_ROOT)
        dev_data = load_train_dev_dataset(train_component, "dev",
                                          HISTORY_TYPE, DATA_ROOT)

        if GPU_ENABLE:
            map_to = "gpu"
        else:
            map_to = "cpu"

        # Select which component model to train and load its saved weights
        model = None
        if train_component == "multi_sql":
            model = MultiSqlPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                      gpu=GPU, use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/multi_sql_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))
        elif train_component == "keyword":
            model = KeyWordPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                     gpu=GPU, use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/keyword_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))
        elif train_component == "col":
            model = ColPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                 gpu=GPU, use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/col_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))
        elif train_component == "op":
            model = OpPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                gpu=GPU, use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/op_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))
        elif train_component == "agg":
            model = AggPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                 gpu=GPU, use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/agg_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))
        elif train_component == "root_tem":
            model = RootTeminalPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                         gpu=GPU, use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/root_tem_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))
        elif train_component == "des_asc":
            model = DesAscLimitPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                         gpu=GPU, use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/des_asc_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))
        elif train_component == "having":
            model = HavingPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                    gpu=GPU, use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/having_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))
        elif train_component == "andor":
            model = AndOrPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth,
                                   gpu=GPU, use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/andor_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))

        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                     weight_decay=0)
        print("finished build model")
        print_flag = False
        embed_layer = WordEmbedding(word_emb, N_word, gpu=GPU,
                                    SQL_TOK=SQL_TOK, trainable=TRAIN_EMB)

        print("start training")
        best_acc = 0.0
        for i in range(ITER):
            print('ITER %d @ %s' % (i + 1, datetime.datetime.now()))
            # epoch_feedback_train arguments:
            # model, optimizer, batch_size, component, embed_layer, data, table_type, ...
            print('Total Loss = %s' % epoch_feedback_train(
                model=model,
                optimizer=optimizer,
                batch_size=BATCH_SIZE,
                component=train_component,
                embed_layer=embed_layer,
                data=train_data,
                table_type=TABLE_TYPE,
                nlq=nlq,
                db_name=db_name,
                correct_query=correct_query,
                correct_query_data=correct_query_data))
            # Check for improvement every 10 iterations
            if i % 10 == 0:
                acc = epoch_acc(model, BATCH_SIZE, train_component,
                                embed_layer, dev_data, table_type=TABLE_TYPE)
                if acc > best_acc:
                    best_acc = acc
                    print("Save model...")
                    torch.save(
                        model.state_dict(),
                        SAVED_MODELS_FOLDER + "/{}_models.dump".format(train_component))
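# -- Hedged sketch -----------------------------------------------------------
# The loop above checkpoints only when dev accuracy improves, and only checks
# every 10 iterations. A condensed, standalone version of that pattern;
# train_step and eval_step are hypothetical stand-ins for epoch_feedback_train
# and epoch_acc, not functions from this codebase.
import torch

def fit(model, optimizer, iters, train_step, eval_step, ckpt_path, eval_every=10):
    best_acc = 0.0
    for i in range(iters):
        loss = train_step(model, optimizer)
        if i % eval_every == 0:
            acc = eval_step(model)
            if acc > best_acc:
                best_acc = acc
                torch.save(model.state_dict(), ckpt_path)  # keep only the best weights
    return best_acc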
logger = logging.getLogger(__name__)

# 1. Command-line arguments
args = sys.argv
is_first_time = args[1]
parse_type = args[2]
embedding_type = args[3]

# 2. Load the data
train_seqs, train_y, test_seqs, test_y = get_input(is_first_time=is_first_time,
                                                   parse_type=parse_type)
hparams = HyperParams().get_cnn_hyper_params()

# 3. Transform the data using embedding vectors
embedding = WordEmbedding(train_seqs, hparams.embedding_dim)
if embedding_type == 'doc2vec':
    model = embedding.get_d2v_model()
else:
    model = embedding.get_w2v_model()
train_X = embedding.get_embedding_mtx(model, train_seqs)
test_X = embedding.get_embedding_mtx(model, test_seqs)

# 4. Reshape to [num_examples, max_sequence_length, embedding_dim]
train_X = train_X.reshape([-1, consts.MAX_SEQUENCE_LENGTH, hparams.embedding_dim])
train_y = train_y.reshape([-1, consts.NUM_LABELS])
test_X = test_X.reshape([-1, consts.MAX_SEQUENCE_LENGTH, hparams.embedding_dim])
test_y = test_y.reshape([-1, consts.NUM_LABELS])
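# -- Hedged sketch -----------------------------------------------------------
# A small sanity check for the reshape in step 4 above: a matrix holding one
# embedding row per token, with N * MAX_SEQUENCE_LENGTH rows in total, folds
# back into one [N, MAX_SEQUENCE_LENGTH, embedding_dim] tensor. The constants
# below are invented for illustration, not the project's consts values.
import numpy as np

MAX_SEQUENCE_LENGTH = 40
EMBEDDING_DIM = 100
NUM_DOCS = 8

flat = np.zeros((NUM_DOCS * MAX_SEQUENCE_LENGTH, EMBEDDING_DIM))
batched = flat.reshape([-1, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM])
assert batched.shape == (NUM_DOCS, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)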