def __init__(self,
                 vocab_size: int,
                 head_num: int = 8,
                 hidden_dim: int = 512,
                 dropout_rate: float = 0.1,
                 max_len: int = 50,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.vocab_size = vocab_size
        self.head_num = head_num
        self.hidden_dim = hidden_dim
        self.dropout_rate = dropout_rate
        self.max_len = max_len

        # Encoder-side embedding layer
        self.enc_embedding = WordEmbedding(vocab_size=vocab_size,
                                           embedding_dim=hidden_dim)
        # Encoder
        self.encoder = Encoder(vocab_size=vocab_size,
                               hidden_dim=hidden_dim,
                               dropout_rate=dropout_rate)

        # Decoder-side embedding layer
        self.dec_embedding = WordEmbedding(vocab_size=vocab_size,
                                           embedding_dim=hidden_dim)
        # Decoder
        self.decoder = Decoder(vocab_size=vocab_size,
                               hidden_dim=hidden_dim,
                               dropout_rate=dropout_rate)
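The WordEmbedding layer itself is not shown in this snippet. A minimal PyTorch-style stand-in matching the constructor call above (only an assumption; the original framework is not visible here) could look like:

import torch.nn as nn

class WordEmbedding(nn.Module):
    """Minimal stand-in: a trainable embedding table keyed by token id."""

    def __init__(self, vocab_size: int, embedding_dim: int):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, token_ids):
        # (batch, seq_len) -> (batch, seq_len, embedding_dim)
        return self.embedding(token_ids)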
Example #2
def create_fasttext_model(corpus_file, method='cbow', out_file=None, **kwargs):
    # type: (Path, str, Path, **Any) -> WordEmbedding
    """Load or create a FastText word embedding.

    Parameters:
        corpus_file (Path): The path of the corpus file.
        method (str): The model type. Must be either 'cbow' or 'skipgram'.
        out_file (Path): The output path of the model. Optional.
        **kwargs: Other keyword arguments.

    Returns:
        WordEmbedding: The trained FastText model.

    Raises:
        ValueError: If method is not 'cbow' or 'skipgram'.
    """
    if method not in {'cbow', 'skipgram'}:
        raise ValueError(f'method must be "cbow" or "skipgram" but got "{method}"')
    if out_file is None:
        out_file = MODELS_PATH.joinpath(corpus_file.name + f'.fasttext.{method}')
    if not out_file.exists():
        binary_file = out_file.parent.joinpath(out_file.name + '.bin')
        if not binary_file.exists():
            subprocess.run(
                [
                    'fasttext', method,
                    '-input', str(corpus_file),
                    '-output', str(out_file),
                ],
                check=True,
            )
        embedding = WordEmbedding.load_fasttext_file(binary_file)
        embedding.save(out_file)
    return WordEmbedding.load_word2vec_file(out_file)
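A brief usage sketch for the loader above; the corpus path is hypothetical, and it assumes the fasttext CLI plus the module's MODELS_PATH and WordEmbedding helpers are available:

from pathlib import Path

corpus = Path("data/corpus.txt")  # hypothetical corpus location
embedding = create_fasttext_model(corpus, method="skipgram")
# Any method other than 'cbow' or 'skipgram' raises ValueError, as documented above.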
Example #3
def main(_):

    # init
    we = WordEmbedding()
    dc = Document()
    cf = Classifier()

    # load data
    docs = dc.getDocs(labeled_only=True)

    # load word embedding model
    if FLAGS.we_model == 'devblog':
        we_model = we.loadDevblogModel(embedding_dim = FLAGS.we_dim,
                                       epochs        = FLAGS.we_epoch,
                                       window        = FLAGS.we_window,
                                       min_count     = FLAGS.we_min_count)
        # han2jamo
        docs.text = docs.text.apply(han2Jamo)
    elif FLAGS.we_model == 'wiki':
        we_model = we.loadWikiModel()

    # word embedding
    docs.vector = docs.text.apply(lambda x: we.embedding(we_model, x))

    # training
    cf_model = cf.train(docs, './checkpoint')
    cf.saveModel(cf_model, FLAGS.cf_model)
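The script above only reads FLAGS.*; assuming absl-style flags (a guess about the original project), a sketch of the definitions it appears to expect, with purely illustrative defaults:

from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_string('we_model', 'devblog', 'word embedding model: devblog or wiki')
flags.DEFINE_integer('we_dim', 100, 'word embedding dimension')
flags.DEFINE_integer('we_epoch', 10, 'word embedding training epochs')
flags.DEFINE_integer('we_window', 5, 'context window size')
flags.DEFINE_integer('we_min_count', 1, 'minimum token frequency')
flags.DEFINE_string('cf_model', './classifier', 'path to save the trained classifier')

if __name__ == '__main__':
    app.run(main)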
Example #4
    def __init__(self, word_emb, N_word, N_h=300, N_depth=2,
                gpu=True, trainable_emb=False, 
                table_type="std", use_hs=True):
        super(SuperModel, self).__init__()
        self.gpu = gpu
        self.N_h = N_h
        self.N_depth = N_depth
        self.trainable_emb = trainable_emb
        self.table_type = table_type
        self.use_hs = use_hs
        self.SQL_TOK = ['<UNK>', '<END>', 'WHERE', 'AND', 'EQL', 'GT', 'LT', '<BEG>']

        # word embedding layer
        self.embed_layer = WordEmbedding(word_emb, N_word, gpu,
                self.SQL_TOK, trainable=trainable_emb)

        # initialize all modules
        self.multi_sql = MultiSqlPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth,gpu=gpu, use_hs=use_hs)
        self.multi_sql.eval()

        self.key_word = KeyWordPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth,
                                        gpu=gpu, use_hs=use_hs)
        self.key_word.eval()

        self.col = ColPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth,gpu=gpu, use_hs=use_hs)
        self.col.eval()

        self.op = OpPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth,gpu=gpu, use_hs=use_hs)
        self.op.eval()

        self.agg = AggPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth,gpu=gpu, use_hs=use_hs)
        self.agg.eval()

        self.root_teminal = RootTeminalPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth,gpu=gpu, use_hs=use_hs)
        self.root_teminal.eval()

        self.des_asc = DesAscLimitPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth,gpu=gpu, use_hs=use_hs)
        self.des_asc.eval()

        self.having = HavingPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth,gpu=gpu, use_hs=use_hs)
        self.having.eval()

        self.andor = AndOrPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth, gpu=gpu, use_hs=use_hs)
        self.andor.eval()

        self.softmax = nn.Softmax(dim = 1) #dim=1
        self.CE = nn.CrossEntropyLoss()
        self.log_softmax = nn.LogSoftmax()
        self.mlsml = nn.MultiLabelSoftMarginLoss()
        self.bce_logit = nn.BCEWithLogitsLoss()
        self.sigm = nn.Sigmoid()
        if gpu:
            self.cuda()
        self.path_not_found = 0
Example #5
def main(url="None"):
    # We create an instance of the word embedding
    wemb = WordEmbedding()

    # We define a data
    dataset = None

    # We will open the file(s)
    with open(url) as json_file:
        dataset = json.load(json_file)
        for data in dataset:
            d = data.get("text", "")
            print(d)

            # We get the words in the sentences
            words = d.split()

            # We build a new text by replacing each word with its most similar word
            embedded_text = ""

            # We iterate for each word and we get the word embedding
            for w in words:
                # We check if the word embedding produces results.
                # similars_list = wemb.get_most_similars(w)
                # print(similars_list)
                try:
                    similars_list = wemb.get_most_similars(w)
                    # We sort the list
                    sorted_list = Sort(similars_list)
                    np_array = np.array(sorted_list)

                    embedded_text += np_array[0, 0] + " "
                except Exception:
                    # We concatenate the original word in case we couldn't find it in the word embedding
                    embedded_text += w + " "

            data['embedded_text'] = embedded_text
            print("We have ended searching the words")

        # new URL
        new_url = url.split("/")[1].split(".")[0]
        # We create a new file for each hash tag file that we consulted.
        with open('Embedded_Results/' + new_url + "_embedded" + '.json',
                  'w') as outfile:
            # We finally dump the tweets + the overall_score in a json file.
            json.dump(dataset, outfile)

    # w_embedding.run()
    print("Exiting main")
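The Sort() helper used above is not defined in the snippet; if get_most_similars returns (word, similarity) pairs in gensim's usual format, a plausible stand-in (an assumption, not the original implementation) is:

def Sort(similars_list):
    # Order (word, similarity) pairs by descending similarity so that
    # np_array[0, 0] picks the most similar word.
    return sorted(similars_list, key=lambda pair: pair[1], reverse=True)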
Example #6
    def __init__(self,
                 num_docs,
                 vocab_size,
                 num_topics,
                 embedding_size,
                 freqs,
                 batch_size,
                 save_graph,
                 num_sampled=40):
        self.num_docs = num_docs
        self.vocab_size = vocab_size
        self.num_topics = num_topics
        self.embedding_size = embedding_size
        self.freqs = freqs
        self.batch_size = batch_size
        self.save_graph = save_graph
        self.num_sampled = num_sampled
        self.lmbda = 200.0
        self.learning_rate = 0.001
        self.moving_avgs = tf.train.ExponentialMovingAverage(0.9)
        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.sesh = tf.Session(config=self.config)
        self.computed_norm = False

        self.logdir = "_".join(
            ("lda2vec", datetime.now().strftime('%y%m%d_%H%M')))

        self.w_embed = WordEmbedding(self.embedding_size,
                                     self.vocab_size,
                                     self.num_sampled,
                                     freqs=self.freqs)

        self.mixture = EmbeddingMixture(self.num_docs, self.num_topics,
                                        self.embedding_size)

        handles = self.retrieve_variables()

        (self.x, self.y, self.docs, self.step, self.switch_loss,
         self.word_context, self.doc_context, self.loss_word2vec,
         self.fraction, self.loss_lda, self.loss, self.loss_avgs_op,
         self.optimizer, self.merged) = handles
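tf.ConfigProto and tf.Session are TensorFlow 1.x APIs; under TensorFlow 2.x the same session setup is only reachable through the compat layer, roughly:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # grow GPU memory on demand
sesh = tf.Session(config=config)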
Example #7
def run(dim, epochs):
    cooc = _load_cooc_matrix()
    vocab = _load_vocab()
    print(vocab)
    word_vectors, _ = _train_embeddings(cooc, dim=dim, epochs=epochs)
    word_emb = WordEmbedding(word_vectors, vocab)

    # Save results to file
    path_word_emb = constants.GLOVE_EMBEDDING_CIL_PATH
    pickle.dump(word_emb, open(path_word_emb, "wb"))
    print("Finished saving embeddings at %s" % path_word_emb)
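As a counterpart to run() above, a short sketch of loading the pickled embedding back (constants.GLOVE_EMBEDDING_CIL_PATH and WordEmbedding as in the example; anything outside the snippet is assumed):

import pickle

def load_glove_embedding():
    # Reads back the WordEmbedding object that run() pickled to disk.
    with open(constants.GLOVE_EMBEDDING_CIL_PATH, "rb") as f:
        return pickle.load(f)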
Example #8
def _load_word2vec_embedding():
    '''
    Loads the word2vec embedding using a binary file. The
    word2vec embeddings are very large and cannot be pickled
    as a WordEmbedding object.
    '''
    print("Loading word2vec embeddings, this may take a while...")
    w2v_model = gensim\
        .models.KeyedVectors\
        .load_word2vec_format(constants.WORD2VEC_EMBEDDING_PATH, binary=True)
    vocab = {k: i for i, k in enumerate(w2v_model.vocab)}
    return WordEmbedding(w2v_model.vectors, vocab)
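Note that KeyedVectors.vocab was removed in gensim 4.0; an equivalent loader for newer gensim versions (a sketch reusing the example's constants and WordEmbedding) builds the vocab from key_to_index instead:

import gensim

def _load_word2vec_embedding_gensim4():
    print("Loading word2vec embeddings, this may take a while...")
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
        constants.WORD2VEC_EMBEDDING_PATH, binary=True)
    vocab = dict(w2v_model.key_to_index)  # word -> row index in w2v_model.vectors
    return WordEmbedding(w2v_model.vectors, vocab)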
Example #9
    def __init__(self,
                 word_emb,
                 num_words,
                 num_hidden=100,
                 num_layers=2,
                 use_gpu=True):
        super(Seq2SQL, self).__init__()

        self.word_emb = word_emb
        self.num_words = num_words
        self.num_hidden = num_hidden
        self.num_layers = num_layers
        self.use_gpu = use_gpu

        self.max_col_num = 45
        self.max_tok_num = 200
        self.COND_OPS = ['EQL', 'GT', 'LT']
        self.SQL_TOK = ['<UNK>', '<BEG>', '<END>', 'WHERE', 'AND'
                        ] + self.COND_OPS

        # GloVe Word Embedding
        self.embed_layer = WordEmbedding(word_emb, num_words, self.SQL_TOK,
                                         use_gpu)

        # Aggregation Classifier
        self.agg_classifier = AggregationClassifier(num_words, num_hidden,
                                                    num_layers)

        # SELECT Column(s)
        self.sel_classifier = SelectClassifier(num_words, num_hidden,
                                               num_layers, self.max_tok_num)

        # WHERE Clause
        self.whr_classifier = WhereClassifier(num_words, num_hidden,
                                              num_layers, self.max_col_num,
                                              self.max_tok_num, use_gpu)

        # run on GPU
        if use_gpu:
            self.cuda()
Example #10
def bolukbasi_debias_generalized(embedding,
                                 words,
                                 out_file,
                                 excludes=None,
                                 **kwargs):
    # type: (WordEmbedding, Iterable[str], Path, Iterable[str], **Any) -> WordEmbedding
    """Debias a word embedding using a generalized version of Bolukbasi's algorithm.

    Parameters:
        embedding (WordEmbedding): The word embedding to debias.
        words (Iterable[str]): A list of words that define the bias subspace.
        out_file (Path): The path to save the new embedding to.
        excludes (Iterable[str]): A collection of words to be excluded from the debiasing.
        **kwargs: Other keyword arguments.

    Returns:
        WordEmbedding: The debiased word embedding.
    """
    if out_file.exists():
        return WordEmbedding.load_word2vec_file(out_file)
    matrix = recenter(
        np.array([embedding[word] for word in words if word in embedding]))
    bias_subspace = _define_pca_bias_subspace(matrix, **kwargs)
    bias_subspace = bias_subspace[np.newaxis, :]
    # debias by rejecting the subspace and reverting the excluded words
    if excludes is None:
        excludes = set()
    new_vectors = reject(embedding.vectors, bias_subspace)
    for word in excludes:
        if word in embedding:
            new_vectors[embedding.index(word)] = embedding[word]
    new_vectors = normalize(new_vectors)
    # create a word embedding from the new vectors
    new_embedding = WordEmbedding.from_vectors(embedding.words, new_vectors)
    new_embedding.source = out_file
    new_embedding.save()
    return new_embedding
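The debiasing step above hinges on reject(), which removes each vector's projection onto the bias subspace; a self-contained numpy sketch of that operation (the real recenter/reject/normalize helpers live in the example's own module):

import numpy as np

def reject_subspace(vectors, subspace):
    # Remove from each row of `vectors` its projection onto the rows of
    # `subspace`; the subspace rows are assumed to be orthonormal.
    projection = (vectors @ subspace.T) @ subspace
    return vectors - projection

vectors = np.random.randn(5, 10)
direction = np.random.randn(1, 10)
direction /= np.linalg.norm(direction)
debiased = reject_subspace(vectors, direction)
print(np.allclose(debiased @ direction.T, 0))  # True: the bias direction is gone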
Example #11
def main(_):
    # init
    we = WordEmbedding()
    dc = Document()
    cf = Classifier()

    # load word embedding model
    if FLAGS.we_model == 'devblog':
        we_model = we.loadDevblogModel(embedding_dim=FLAGS.we_dim,
                                       epochs=FLAGS.we_epoch,
                                       window=FLAGS.we_window,
                                       min_count=FLAGS.we_min_count)
    elif FLAGS.we_model == 'wiki':
        we_model = we.loadWikiModel()

    # load classifier model
    cf_model = cf.loadModel(FLAGS.cf_model)

    results = [{'text': r} for r in FLAGS.predict]
    is_devblog = FLAGS.we_model == 'devblog'
    for i, r in enumerate(FLAGS.predict):
        # preprocessing
        text = han2Jamo(r) if is_devblog else r

        # word embedding
        df = dc.preprocessing(text, devblog=is_devblog)
        vector = df.text.apply(
            lambda x: we.embedding(we_model, x, FLAGS.we_dim)).tolist()
        if len(vector) == 0:
            print('🐈 text is not valid :', r)
            return
        else:
            # predict
            results[i]['predict'] = cf.predict(cf_model, np.array(vector),
                                               FLAGS.criterion)

    return results
Example #12
def adapt_embed(path, bin_path, embed_path, strategy, source_lang,
                target_lang):
    # process encoder [MA]
    embed_encoder_path = os.path.join(path, 'tmp',
                                      'generated_embeds_encoder.txt')
    word_embedding = WordEmbedding(
        os.path.join(os.path.abspath(embed_path), 'best_embeds_encoder.txt'),
        os.path.join(os.path.abspath(bin_path), f"dict.{source_lang}.txt"))
    word_embedding.process_embed(strategy, embed_encoder_path)
    # process decoder [MA]
    embed_decoder_path = os.path.join(path, 'tmp',
                                      'generated_embeds_decoder.txt')
    word_embedding = WordEmbedding(
        os.path.join(os.path.abspath(embed_path), 'best_embeds_decoder.txt'),
        os.path.join(os.path.abspath(bin_path), f"dict.{target_lang}.txt"))
    word_embedding.process_embed(strategy, embed_decoder_path)
    return embed_encoder_path, embed_decoder_path
Example #13
	def main(self):
		'''
		This is the main function for performing the 
		Document Clustering.
		'''

		# Create object of ConfigParser Class
		config_obj = ConfigParse()
    
		# Parse config file
		print 'READING CONFIG FILE'
		config_obj.config_reader()

		# Create object of WordEmbedding Class
		word_embedding_obj = WordEmbedding(config_obj.input_file_path,
    						 config_obj.word2vec_model,
    						 config_obj.word_vector_dim)

		print 'CONVERTING INPUT SENTENCES TO VECTORS'
		embedding_file = word_embedding_obj.sentence_to_vector()

		# Create object of Clustering Class
		clustering_obj = Clustering(embedding_file, config_obj.output_dir_path,
    					config_obj.threshold, config_obj.representative_word_vector,
    					config_obj.cluster_overlap, config_obj.word_vector_dim)

		print 'CLUSTERING SENTENCES'
		num_of_clusters = clustering_obj.cluster_sentences()
		print str(num_of_clusters) + ' NUMBER OF CLUSTERS ARE GENERATED.'

		# Remove Temporary Files
		os.remove(embedding_file)
		for subdir, dirs, cluster_files in os.walk(config_obj.output_dir_path):
			for cluster_file in cluster_files:
				if 'rep_' in cluster_file:
					os.remove(config_obj.output_dir_path + '/' + cluster_file)
Example #14
                               use_hs=use_hs)
        model.load_state_dict(
            torch.load("{}/andor_models.dump".format(SAVED_MODELS_FOLDER),
                       map_location=map_to))

    # model = SQLNet(word_emb, N_word=N_word, gpu=GPU, trainable_emb=args.train_emb)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=0)
    print("finished building model")

    print_flag = False
    embed_layer = WordEmbedding(word_emb,
                                N_word,
                                gpu=GPU,
                                SQL_TOK=SQL_TOK,
                                trainable=args.train_emb)

    print("Dev Accuracy")
    # best_acc = 0.0
    # for i in range(args.epoch):
    #     print('Epoch %d @ %s'%(i+1, datetime.datetime.now()))
    # arguments of epoch_train
    # model, optimizer, batch_size, component,embed_layer,data, table_type
    # print(' Loss = %s' % epoch_train(
    #                     model, optimizer, BATCH_SIZE,
    #                     args.train_component,
    #                     embed_layer,
    #                     train_data,
    #                     table_type=args.table_type))
Example #15
class SuperModel(nn.Module):
    def __init__(self,
                 word_emb,
                 N_word,
                 N_h=300,
                 N_depth=2,
                 gpu=True,
                 trainable_emb=False,
                 table_type="std",
                 use_hs=True):
        super(SuperModel, self).__init__()
        self.gpu = gpu
        self.N_h = N_h
        self.N_depth = N_depth
        self.trainable_emb = trainable_emb
        self.table_type = table_type
        self.use_hs = use_hs
        self.SQL_TOK = [
            '<UNK>', '<END>', 'WHERE', 'AND', 'EQL', 'GT', 'LT', '<BEG>'
        ]

        # word embedding layer
        self.embed_layer = WordEmbedding(word_emb,
                                         N_word,
                                         gpu,
                                         self.SQL_TOK,
                                         trainable=trainable_emb)

        # initialize all modules
        self.multi_sql = MultiSqlPredictor(N_word=N_word,
                                           N_h=N_h,
                                           N_depth=N_depth,
                                           gpu=gpu,
                                           use_hs=use_hs)
        self.multi_sql.eval()

        self.key_word = KeyWordPredictor(N_word=N_word,
                                         N_h=N_h,
                                         N_depth=N_depth,
                                         gpu=gpu,
                                         use_hs=use_hs)
        self.key_word.eval()

        self.col = ColPredictor(N_word=N_word,
                                N_h=N_h,
                                N_depth=N_depth,
                                gpu=gpu,
                                use_hs=use_hs)
        self.col.eval()

        self.op = OpPredictor(N_word=N_word,
                              N_h=N_h,
                              N_depth=N_depth,
                              gpu=gpu,
                              use_hs=use_hs)
        self.op.eval()

        self.agg = AggPredictor(N_word=N_word,
                                N_h=N_h,
                                N_depth=N_depth,
                                gpu=gpu,
                                use_hs=use_hs)
        self.agg.eval()

        self.root_teminal = RootTeminalPredictor(N_word=N_word,
                                                 N_h=N_h,
                                                 N_depth=N_depth,
                                                 gpu=gpu,
                                                 use_hs=use_hs)
        self.root_teminal.eval()

        self.des_asc = DesAscLimitPredictor(N_word=N_word,
                                            N_h=N_h,
                                            N_depth=N_depth,
                                            gpu=gpu,
                                            use_hs=use_hs)
        self.des_asc.eval()

        self.having = HavingPredictor(N_word=N_word,
                                      N_h=N_h,
                                      N_depth=N_depth,
                                      gpu=gpu,
                                      use_hs=use_hs)
        self.having.eval()

        self.andor = AndOrPredictor(N_word=N_word,
                                    N_h=N_h,
                                    N_depth=N_depth,
                                    gpu=gpu,
                                    use_hs=use_hs)
        self.andor.eval()

        self.softmax = nn.Softmax(dim=1)
        self.CE = nn.CrossEntropyLoss()
        self.log_softmax = nn.LogSoftmax()
        self.mlsml = nn.MultiLabelSoftMarginLoss()
        self.bce_logit = nn.BCEWithLogitsLoss()
        self.sigm = nn.Sigmoid()
        if gpu:
            self.cuda()
        self.path_not_found = 0

    def forward(self, q_seq, history, tables):
        # if self.part:
        #     return self.part_forward(q_seq,history,tables)
        # else:
        return self.full_forward(q_seq, history, tables)

    def full_forward(self, q_seq, history, tables):
        B = len(q_seq)
        # print("q_seq:{}".format(q_seq))
        # print("Batch size:{}".format(B))
        q_emb_var, q_len = self.embed_layer.gen_x_q_batch(q_seq)
        col_seq = to_batch_tables(tables, B, self.table_type)
        col_emb_var, col_name_len, col_len = self.embed_layer.gen_col_batch(
            col_seq)

        mkw_emb_var = self.embed_layer.gen_word_list_embedding(
            ["none", "except", "intersect", "union"], (B))
        mkw_len = np.full(q_len.shape, 4, dtype=np.int64)
        kw_emb_var = self.embed_layer.gen_word_list_embedding(
            ["where", "group by", "order by"], (B))
        kw_len = np.full(q_len.shape, 3, dtype=np.int64)

        stack = Stack()
        stack.push(("root", None))
        history = [["root"]] * B
        andor_cond = ""
        has_limit = False
        # sql = {}
        current_sql = {}
        sql_stack = []
        idx_stack = []
        kw_stack = []
        kw = ""
        nested_label = ""
        has_having = False

        timeout = time.time() + 2  # set timer to prevent infinite recursion in SQL generation
        failed = False
        while not stack.isEmpty():
            if time.time() > timeout:
                failed = True
                break
            vet = stack.pop()
            # print(vet)
            hs_emb_var, hs_len = self.embed_layer.gen_x_history_batch(history)
            if len(idx_stack) > 0 and stack.size() < idx_stack[-1]:
                # print("pop!!!!!!!!!!!!!!!!!!!!!!")
                idx_stack.pop()
                current_sql = sql_stack.pop()
                kw = kw_stack.pop()
                # current_sql = current_sql["sql"]
            # history.append(vet)
            # print("hs_emb:{} hs_len:{}".format(hs_emb_var.size(),hs_len.size()))
            if isinstance(vet, tuple) and vet[0] == "root":
                if history[0][-1] != "root":
                    history[0].append("root")
                    hs_emb_var, hs_len = self.embed_layer.gen_x_history_batch(
                        history)
                if vet[1] != "original":
                    idx_stack.append(stack.size())
                    sql_stack.append(current_sql)
                    kw_stack.append(kw)
                else:
                    idx_stack.append(stack.size())
                    sql_stack.append(sql_stack[-1])
                    kw_stack.append(kw)
                if "sql" in current_sql:
                    current_sql["nested_sql"] = {}
                    current_sql["nested_label"] = nested_label
                    current_sql = current_sql["nested_sql"]
                elif isinstance(vet[1], dict):
                    vet[1]["sql"] = {}
                    current_sql = vet[1]["sql"]
                elif vet[1] != "original":
                    current_sql["sql"] = {}
                    current_sql = current_sql["sql"]
                # print("q_emb_var:{} hs_emb_var:{} mkw_emb_var:{}".format(q_emb_var.size(),hs_emb_var.size(),mkw_emb_var.size()))
                if vet[1] == "nested" or vet[1] == "original":
                    stack.push("none")
                    history[0].append("none")
                else:
                    score = self.multi_sql.forward(q_emb_var, q_len,
                                                   hs_emb_var, hs_len,
                                                   mkw_emb_var, mkw_len)
                    label = np.argmax(score[0].data.cpu().numpy())
                    label = SQL_OPS[label]
                    history[0].append(label)
                    stack.push(label)
                if label != "none":
                    nested_label = label

            elif vet in ('intersect', 'except', 'union'):
                stack.push(("root", "nested"))
                stack.push(("root", "original"))
                # history[0].append("root")
            elif vet == "none":
                score = self.key_word.forward(q_emb_var, q_len, hs_emb_var,
                                              hs_len, kw_emb_var, kw_len)
                kw_num_score, kw_score = [x.data.cpu().numpy() for x in score]
                # print("kw_num_score:{}".format(kw_num_score))
                # print("kw_score:{}".format(kw_score))
                num_kw = np.argmax(kw_num_score[0])
                kw_score = list(np.argsort(-kw_score[0])[:num_kw])
                kw_score.sort(reverse=True)
                # print("num_kw:{}".format(num_kw))
                for kw in kw_score:
                    stack.push(KW_OPS[kw])
                stack.push("select")
            elif vet in ("select", "orderBy", "where", "groupBy", "having"):
                kw = vet
                current_sql[kw] = []
                history[0].append(vet)
                stack.push(("col", vet))
                # score = self.andor.forward(q_emb_var,q_len,hs_emb_var,hs_len)
                # label = score[0].data.cpu().numpy()
                # andor_cond = COND_OPS[label]
                # history.append("")
            # elif vet == "groupBy":
            #     score = self.having.forward(q_emb_var,q_len,hs_emb_var,hs_len,col_emb_var,col_len,)
            elif isinstance(vet, tuple) and vet[0] == "col":
                # print("q_emb_var:{} hs_emb_var:{} col_emb_var:{}".format(q_emb_var.size(), hs_emb_var.size(),col_emb_var.size()))
                score = self.col.forward(q_emb_var, q_len, hs_emb_var, hs_len,
                                         col_emb_var, col_len, col_name_len)
                col_num_score, col_score = [
                    x.data.cpu().numpy() for x in score
                ]
                col_num = np.argmax(col_num_score[0]) + 1  # double check
                cols = np.argsort(-col_score[0])[:col_num]
                # print(col_num)
                # print("col_num_score:{}".format(col_num_score))
                # print("col_score:{}".format(col_score))
                for col in cols:
                    if vet[1] == "where":
                        stack.push(("op", "where", col))
                    elif vet[1] != "groupBy":
                        stack.push(("agg", vet[1], col))
                    elif vet[1] == "groupBy":
                        history[0].append(index_to_column_name(col, tables))
                        current_sql[kw].append(
                            index_to_column_name(col, tables))
                # predict "and"/"or" when there are multiple columns in the where condition
                if col_num > 1 and vet[1] == "where":
                    score = self.andor.forward(q_emb_var, q_len, hs_emb_var,
                                               hs_len)
                    label = np.argmax(score[0].data.cpu().numpy())
                    andor_cond = COND_OPS[label]
                    current_sql[kw].append(andor_cond)
                if vet[1] == "groupBy" and col_num > 0:
                    score = self.having.forward(
                        q_emb_var, q_len, hs_emb_var, hs_len, col_emb_var,
                        col_len, col_name_len,
                        np.full(B, cols[0], dtype=np.int64))
                    label = np.argmax(score[0].data.cpu().numpy())
                    if label == 1:
                        has_having = (label == 1)
                        # stack.insert(-col_num,"having")
                        stack.push("having")
                # history.append(index_to_column_name(cols[-1], tables[0]))
            elif isinstance(vet, tuple) and vet[0] == "agg":
                history[0].append(index_to_column_name(vet[2], tables))
                if vet[1] not in ("having", "orderBy"):  #DEBUG-ed 20180817
                    try:
                        current_sql[kw].append(
                            index_to_column_name(vet[2], tables))
                    except Exception as e:
                        # print(e)
                        traceback.print_exc()
                        print("history:{},current_sql:{} stack:{}".format(
                            history[0], current_sql, stack.items))
                        print("idx_stack:{}".format(idx_stack))
                        print("sql_stack:{}".format(sql_stack))
                        exit(1)
                hs_emb_var, hs_len = self.embed_layer.gen_x_history_batch(
                    history)

                score = self.agg.forward(q_emb_var, q_len, hs_emb_var, hs_len,
                                         col_emb_var, col_len, col_name_len,
                                         np.full(B, vet[2], dtype=np.int64))
                agg_num_score, agg_score = [
                    x.data.cpu().numpy() for x in score
                ]
                agg_num = np.argmax(agg_num_score[0])  # double check
                agg_idxs = np.argsort(-agg_score[0])[:agg_num]
                # print("agg:{}".format([AGG_OPS[agg] for agg in agg_idxs]))
                if len(agg_idxs) > 0:
                    history[0].append(AGG_OPS[agg_idxs[0]])
                    if vet[1] not in ("having", "orderBy"):
                        current_sql[kw].append(AGG_OPS[agg_idxs[0]])
                    elif vet[1] == "orderBy":
                        stack.push(("des_asc", vet[2],
                                    AGG_OPS[agg_idxs[0]]))  #DEBUG-ed 20180817
                    else:
                        stack.push(
                            ("op", "having", vet[2], AGG_OPS[agg_idxs[0]]))
                for agg in agg_idxs[1:]:
                    history[0].append(index_to_column_name(vet[2], tables))
                    history[0].append(AGG_OPS[agg])
                    if vet[1] not in ("having", "orderBy"):
                        current_sql[kw].append(
                            index_to_column_name(vet[2], tables))
                        current_sql[kw].append(AGG_OPS[agg])
                    elif vet[1] == "orderBy":
                        stack.push(("des_asc", vet[2], AGG_OPS[agg]))
                    else:
                        stack.push(("op", "having", vet[2], agg_idxs))
                if len(agg_idxs) == 0:
                    if vet[1] not in ("having", "orderBy"):
                        current_sql[kw].append("none_agg")
                    elif vet[1] == "orderBy":
                        stack.push(("des_asc", vet[2], "none_agg"))
                    else:
                        stack.push(("op", "having", vet[2], "none_agg"))
                # current_sql[kw].append([AGG_OPS[agg] for agg in agg_idxs])
                # if vet[1] == "having":
                #     stack.push(("op","having",vet[2],agg_idxs))
                # if vet[1] == "orderBy":
                #     stack.push(("des_asc",vet[2],agg_idxs))
                # if vet[1] == "groupBy" and has_having:
                #     stack.push("having")
            elif isinstance(vet, tuple) and vet[0] == "op":
                if vet[1] == "where":
                    # current_sql[kw].append(index_to_column_name(vet[2], tables))
                    history[0].append(index_to_column_name(vet[2], tables))
                    hs_emb_var, hs_len = self.embed_layer.gen_x_history_batch(
                        history)

                score = self.op.forward(q_emb_var, q_len, hs_emb_var, hs_len,
                                        col_emb_var, col_len, col_name_len,
                                        np.full(B, vet[2], dtype=np.int64))

                op_num_score, op_score = [x.data.cpu().numpy() for x in score]
                # num_score 0 maps to 1 in truth; there must be at least one op
                op_num = np.argmax(op_num_score[0]) + 1
                ops = np.argsort(-op_score[0])[:op_num]
                # current_sql[kw].append([NEW_WHERE_OPS[op] for op in ops])
                if op_num > 0:
                    history[0].append(NEW_WHERE_OPS[ops[0]])
                    if vet[1] == "having":
                        stack.push(("root_teminal", vet[2], vet[3], ops[0]))
                    else:
                        stack.push(("root_teminal", vet[2], ops[0]))
                    # current_sql[kw].append(NEW_WHERE_OPS[ops[0]])
                for op in ops[1:]:
                    history[0].append(index_to_column_name(vet[2], tables))
                    history[0].append(NEW_WHERE_OPS[op])
                    # current_sql[kw].append(index_to_column_name(vet[2], tables))
                    # current_sql[kw].append(NEW_WHERE_OPS[op])
                    if vet[1] == "having":
                        stack.push(("root_teminal", vet[2], vet[3], op))
                    else:
                        stack.push(("root_teminal", vet[2], op))
                # stack.push(("root_teminal",vet[2]))
            elif isinstance(vet, tuple) and vet[0] == "root_teminal":
                score = self.root_teminal.forward(
                    q_emb_var, q_len, hs_emb_var, hs_len, col_emb_var, col_len,
                    col_name_len, np.full(B, vet[1], dtype=np.int64))

                label = np.argmax(score[0].data.cpu().numpy())
                label = ROOT_TERM_OPS[label]
                if len(vet) == 4:
                    current_sql[kw].append(index_to_column_name(
                        vet[1], tables))
                    current_sql[kw].append(vet[2])
                    current_sql[kw].append(NEW_WHERE_OPS[vet[3]])
                else:
                    # print("kw:{}".format(kw))
                    try:
                        current_sql[kw].append(
                            index_to_column_name(vet[1], tables))
                    except Exception as e:
                        # print(e)
                        traceback.print_exc()
                        print("history:{},current_sql:{} stack:{}".format(
                            history[0], current_sql, stack.items))
                        print("idx_stack:{}".format(idx_stack))
                        print("sql_stack:{}".format(sql_stack))
                        exit(1)
                    current_sql[kw].append(NEW_WHERE_OPS[vet[2]])
                if label == "root":
                    history[0].append("root")
                    current_sql[kw].append({})
                    # current_sql = current_sql[kw][-1]
                    stack.push(("root", current_sql[kw][-1]))
                else:
                    current_sql[kw].append("terminal")
            elif isinstance(vet, tuple) and vet[0] == "des_asc":
                current_sql[kw].append(index_to_column_name(vet[1], tables))
                current_sql[kw].append(vet[2])
                score = self.des_asc.forward(
                    q_emb_var, q_len, hs_emb_var, hs_len, col_emb_var, col_len,
                    col_name_len, np.full(B, vet[1], dtype=np.int64))
                label = np.argmax(score[0].data.cpu().numpy())
                dec_asc, has_limit = DEC_ASC_OPS[label]
                history[0].append(dec_asc)
                current_sql[kw].append(dec_asc)
                current_sql[kw].append(has_limit)
        # print("{}".format(current_sql))

        if failed: return None
        print("history:{}".format(history[0]))
        if len(sql_stack) > 0:
            current_sql = sql_stack[0]
        # print("{}".format(current_sql))
        return current_sql

    def gen_col(self, col, table, table_alias_dict):
        colname = table["column_names_original"][col[2]][1]
        table_idx = table["column_names_original"][col[2]][0]
        if table_idx not in table_alias_dict:
            return colname
        return "T{}.{}".format(table_alias_dict[table_idx], colname)

    def gen_group_by(self, sql, kw, table, table_alias_dict):
        ret = []
        for i in range(0, len(sql)):
            # if len(sql[i+1]) == 0:
            # if sql[i+1] == "none_agg":
            ret.append(self.gen_col(sql[i], table, table_alias_dict))
            # else:
            #     ret.append("{}({})".format(sql[i+1], self.gen_col(sql[i], table, table_alias_dict)))
            # for agg in sql[i+1]:
            #     ret.append("{}({})".format(agg,gen_col(sql[i],table,table_alias_dict)))
        return "{} {}".format(kw, ",".join(ret))

    def gen_select(self, sql, kw, table, table_alias_dict):
        ret = []
        for i in range(0, len(sql), 2):
            # if len(sql[i+1]) == 0:
            if sql[i + 1] == "none_agg" or not isinstance(
                    sql[i + 1], basestring):  #DEBUG-ed 20180817
                ret.append(self.gen_col(sql[i], table, table_alias_dict))
            else:
                ret.append("{}({})".format(
                    sql[i + 1], self.gen_col(sql[i], table, table_alias_dict)))
            # for agg in sql[i+1]:
            #     ret.append("{}({})".format(agg,gen_col(sql[i],table,table_alias_dict)))
        return "{} {}".format(kw, ",".join(ret))

    def gen_where(self, sql, table, table_alias_dict):
        if len(sql) == 0:
            return ""
        start_idx = 0
        andor = "and"
        if isinstance(sql[0], basestring):
            start_idx += 1
            andor = sql[0]
        ret = []
        for i in range(start_idx, len(sql), 3):
            col = self.gen_col(sql[i], table, table_alias_dict)
            op = sql[i + 1]
            val = sql[i + 2]
            where_item = ""
            if val == "terminal":
                where_item = "{} {} '{}'".format(col, op, val)
            else:
                val = self.gen_sql(val, table)
                where_item = "{} {} ({})".format(col, op, val)
            if op == "between":
                # TODO: temporarily fixed
                where_item += " and 'terminal'"
            ret.append(where_item)
        return "where {}".format(" {} ".format(andor).join(ret))

    def gen_orderby(self, sql, table, table_alias_dict):
        ret = []
        limit = ""
        if sql[-1] == True:
            limit = "limit 1"
        for i in range(0, len(sql), 4):
            if sql[i + 1] == "none_agg" or not isinstance(
                    sql[i + 1], basestring):  #DEBUG-ed 20180817
                ret.append("{} {}".format(
                    self.gen_col(sql[i], table, table_alias_dict), sql[i + 2]))
            else:
                ret.append("{}({}) {}".format(
                    sql[i + 1], self.gen_col(sql[i], table, table_alias_dict),
                    sql[i + 2]))
        return "order by {} {}".format(",".join(ret), limit)

    def gen_having(self, sql, table, table_alias_dict):
        ret = []
        for i in range(0, len(sql), 4):
            if sql[i + 1] == "none_agg":
                col = self.gen_col(sql[i], table, table_alias_dict)
            else:
                col = "{}({})".format(
                    sql[i + 1], self.gen_col(sql[i], table, table_alias_dict))
            op = sql[i + 2]
            val = sql[i + 3]
            if val == "terminal":
                ret.append("{} {} '{}'".format(col, op, val))
            else:
                val = self.gen_sql(val, table)
                ret.append("{} {} ({})".format(col, op, val))
        return "having {}".format(",".join(ret))

    def find_shortest_path(self, start, end, graph):
        stack = [[start, []]]
        visited = set()
        while len(stack) > 0:
            ele, history = stack.pop()
            if ele == end:
                return history
            for node in graph[ele]:
                if node[0] not in visited:
                    stack.append((node[0], history + [(node[0], node[1])]))
                    visited.add(node[0])
        print("table {} table {}".format(start, end))
        # print("could not find path!!!!!{}".format(self.path_not_found))
        self.path_not_found += 1
        # return []
    def gen_from(self, candidate_tables, table):
        def find(d, col):
            if d[col] == -1:
                return col
            return find(d, d[col])

        def union(d, c1, c2):
            r1 = find(d, c1)
            r2 = find(d, c2)
            if r1 == r2:
                return
            d[r1] = r2

        ret = ""
        if len(candidate_tables) <= 1:
            if len(candidate_tables) == 1:
                ret = "from {}".format(
                    table["table_names_original"][list(candidate_tables)[0]])
            else:
                ret = "from {}".format(table["table_names_original"][0])
            # TODO: temporary settings
            return {}, ret
        # print("candidate:{}".format(candidate_tables))
        table_alias_dict = {}
        uf_dict = {}
        for t in candidate_tables:
            uf_dict[t] = -1
        idx = 1
        graph = defaultdict(list)
        for acol, bcol in table["foreign_keys"]:
            t1 = table["column_names"][acol][0]
            t2 = table["column_names"][bcol][0]
            graph[t1].append((t2, (acol, bcol)))
            graph[t2].append((t1, (bcol, acol)))
            # if t1 in candidate_tables and t2 in candidate_tables:
            #     r1 = find(uf_dict,t1)
            #     r2 = find(uf_dict,t2)
            #     if r1 == r2:
            #         continue
            #     union(uf_dict,t1,t2)
            #     if len(ret) == 0:
            #         ret = "from {} as T{} join {} as T{} on T{}.{}=T{}.{}".format(table["table_names"][t1],idx,table["table_names"][t2],
            #                                                                       idx+1,idx,table["column_names_original"][acol][1],idx+1,
            #                                                                       table["column_names_original"][bcol][1])
            #         table_alias_dict[t1] = idx
            #         table_alias_dict[t2] = idx+1
            #         idx += 2
            #     else:
            #         if t1 in table_alias_dict:
            #             old_t = t1
            #             new_t = t2
            #             acol,bcol = bcol,acol
            #         elif t2 in table_alias_dict:
            #             old_t = t2
            #             new_t = t1
            #         else:
            #             ret = "{} join {} as T{} join {} as T{} on T{}.{}=T{}.{}".format(ret,table["table_names"][t1], idx,
            #                                                                           table["table_names"][t2],
            #                                                                           idx + 1, idx,
            #                                                                           table["column_names_original"][acol][1],
            #                                                                           idx + 1,
            #                                                                           table["column_names_original"][bcol][1])
            #             table_alias_dict[t1] = idx
            #             table_alias_dict[t2] = idx + 1
            #             idx += 2
            #             continue
            #         ret = "{} join {} as T{} on T{}.{}=T{}.{}".format(ret,new_t,idx,idx,table["column_names_original"][acol][1],
            #                                                        table_alias_dict[old_t],table["column_names_original"][bcol][1])
            #         table_alias_dict[new_t] = idx
            #         idx += 1
        # visited = set()
        candidate_tables = list(candidate_tables)
        start = candidate_tables[0]
        table_alias_dict[start] = idx
        idx += 1
        ret = "from {} as T1".format(table["table_names_original"][start])
        try:
            for end in candidate_tables[1:]:
                if end in table_alias_dict:
                    continue
                path = self.find_shortest_path(start, end, graph)
                prev_table = start
                if not path:
                    table_alias_dict[end] = idx
                    idx += 1
                    ret = "{} join {} as T{}".format(
                        ret,
                        table["table_names_original"][end],
                        table_alias_dict[end],
                    )
                    continue
                for node, (acol, bcol) in path:
                    if node in table_alias_dict:
                        prev_table = node
                        continue
                    table_alias_dict[node] = idx
                    idx += 1
                    ret = "{} join {} as T{} on T{}.{} = T{}.{}".format(
                        ret, table["table_names_original"][node],
                        table_alias_dict[node], table_alias_dict[prev_table],
                        table["column_names_original"][acol][1],
                        table_alias_dict[node],
                        table["column_names_original"][bcol][1])
                    prev_table = node
        except:
            traceback.print_exc()
            print("db:{}".format(table["db_id"]))
            # print(table["db_id"])
            return table_alias_dict, ret
        # if len(candidate_tables) != len(table_alias_dict):
        #     print("error in generate from clause!!!!!")
        return table_alias_dict, ret

    def gen_sql(self, sql, table):
        select_clause = ""
        from_clause = ""
        groupby_clause = ""
        orderby_clause = ""
        having_clause = ""
        where_clause = ""
        nested_clause = ""
        cols = {}
        candidate_tables = set()
        nested_sql = {}
        nested_label = ""
        parent_sql = sql
        # if "sql" in sql:
        #     sql = sql["sql"]
        if "nested_label" in sql:
            nested_label = sql["nested_label"]
            nested_sql = sql["nested_sql"]
            sql = sql["sql"]
        elif "sql" in sql:
            sql = sql["sql"]
        for key in sql:
            if key not in KW_WITH_COL:
                continue
            for item in sql[key]:
                if isinstance(item, tuple) and len(item) == 3:
                    if table["column_names"][item[2]][0] != -1:
                        candidate_tables.add(table["column_names"][item[2]][0])
        table_alias_dict, from_clause = self.gen_from(candidate_tables, table)
        ret = []
        if "select" in sql:
            select_clause = self.gen_select(sql["select"], "select", table,
                                            table_alias_dict)
            if len(select_clause) > 0:
                ret.append(select_clause)
            else:
                print("select not found:{}".format(parent_sql))
        else:
            print("select not found:{}".format(parent_sql))
        if len(from_clause) > 0:
            ret.append(from_clause)
        if "where" in sql:
            where_clause = self.gen_where(sql["where"], table,
                                          table_alias_dict)
            if len(where_clause) > 0:
                ret.append(where_clause)
        if "groupBy" in sql:  ## DEBUG-ed order
            groupby_clause = self.gen_group_by(sql["groupBy"], "group by",
                                               table, table_alias_dict)
            if len(groupby_clause) > 0:
                ret.append(groupby_clause)
        if "orderBy" in sql:
            orderby_clause = self.gen_orderby(sql["orderBy"], table,
                                              table_alias_dict)
            if len(orderby_clause) > 0:
                ret.append(orderby_clause)
        if "having" in sql:
            having_clause = self.gen_having(sql["having"], table,
                                            table_alias_dict)
            if len(having_clause) > 0:
                ret.append(having_clause)
        if len(nested_label) > 0:
            nested_clause = "{} {}".format(nested_label,
                                           self.gen_sql(nested_sql, table))
            if len(nested_clause) > 0:
                ret.append(nested_clause)
        return " ".join(ret)

    def check_acc(self, pred_sql, gt_sql):
        pass
Example #16
    def __init__(self,
                 word_emb,
                 N_word,
                 N_h=100,
                 N_depth=2,
                 gpu=False,
                 use_ca=True,
                 trainable_emb=False):
        super(SQLNet, self).__init__()
        self.use_ca = use_ca
        self.trainable_emb = trainable_emb

        self.gpu = gpu
        self.N_h = N_h
        self.N_depth = N_depth

        self.max_col_num = 45
        self.max_tok_num = 200
        self.SQL_TOK = [
            '<UNK>', '<END>', 'WHERE', 'AND', 'OR', '==', '>', '<', '!=',
            '<BEG>'
        ]
        self.COND_OPS = ['>', '<', '==', '!=']

        # Word vectors: you can train your own or use pretrained ones; here we use the preloaded vectors
        self.embed_layer = WordEmbedding(word_emb,
                                         N_word,
                                         gpu,
                                         self.SQL_TOK,
                                         our_model=True,
                                         trainable=trainable_emb)

        # Predict the number of selected columns
        self.sel_num = SelNumPredictor(N_word, N_h, N_depth, use_ca=use_ca)

        # Predict which columns are selected
        self.sel_pred = SelPredictor(N_word,
                                     N_h,
                                     N_depth,
                                     self.max_tok_num,
                                     use_ca=use_ca)

        # Predict the aggregation function for each selected column
        self.agg_pred = AggPredictor(N_word, N_h, N_depth, use_ca=use_ca)

        # Predict the number of conditions, condition columns, condition operators and condition values
        self.cond_pred = SQLNetCondPredictor(N_word, N_h, N_depth,
                                             self.max_col_num,
                                             self.max_tok_num, use_ca, gpu)

        # Predict the relation between conditions, e.g. "and" or "or"
        self.where_rela_pred = WhereRelationPredictor(N_word,
                                                      N_h,
                                                      N_depth,
                                                      use_ca=use_ca)

        self.CE = nn.CrossEntropyLoss()  # cross-entropy loss
        self.softmax = nn.Softmax(dim=-1)
        self.log_softmax = nn.LogSoftmax()
        self.bce_logit = nn.BCEWithLogitsLoss()
        if gpu:
            self.cuda()
Example #17
import pandas as pd
from word_embedding import WordEmbedding
from solution.clean import CleanText

ct = CleanText()
data = pd.read_csv("data/Categorie_original.zip",sep=";").fillna("")
ct.clean_df_column(data, "Description", "Description_cleaned")
array_token = [line.split(" ") for line in data["Description_cleaned"].values]
print(len(array_token))

features_dimension = 300
min_count = 1
window = 5
hs = 0
negative = 10

we_sg = WordEmbedding(word_embedding_type="word2vec",
                      args=dict(sentences=array_token, sg=1, hs=hs, negative=negative,
                                min_count=min_count, size=features_dimension,
                                window=window, iter=15))
model_sg, training_time_sg = we_sg.train()
print("Model Skip-gram trained in %.2f minutes"%(training_time_sg/60))


we_cbow = WordEmbedding(word_embedding_type="word2vec",
                        args=dict(sentences=array_token, sg=0, hs=hs, negative=negative,
                                  min_count=min_count, size=features_dimension,
                                  window=window, iter=15))
model_cbow, training_time_cbow = we_cbow.train()
print("Model CBOW trained in %.2f minutes"%(training_time_cbow/60))

model_sg.save("data/full_model_sg")
model_cbow.save("data/full_model_cbow")
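If the WordEmbedding wrapper above hands its args straight to gensim's Word2Vec (as the parameter names suggest), note that gensim 4.0 renamed size to vector_size and iter to epochs; the equivalent argument dict would be roughly:

args = dict(sentences=array_token, sg=1, hs=hs, negative=negative,
            min_count=min_count, vector_size=features_dimension,
            window=window, epochs=15)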
Example #18
class Seq2SQL(nn.Module):
    def __init__(self,
                 word_emb,
                 num_words,
                 num_hidden=100,
                 num_layers=2,
                 use_gpu=True):
        super(Seq2SQL, self).__init__()

        self.word_emb = word_emb
        self.num_words = num_words
        self.num_hidden = num_hidden
        self.num_layers = num_layers
        self.use_gpu = use_gpu

        self.max_col_num = 45
        self.max_tok_num = 200
        self.COND_OPS = ['EQL', 'GT', 'LT']
        self.SQL_TOK = ['<UNK>', '<BEG>', '<END>', 'WHERE', 'AND'
                        ] + self.COND_OPS

        # GloVe Word Embedding
        self.embed_layer = WordEmbedding(word_emb, num_words, self.SQL_TOK,
                                         use_gpu)

        # Aggregation Classifier
        self.agg_classifier = AggregationClassifier(num_words, num_hidden,
                                                    num_layers)

        # SELECT Column(s)
        self.sel_classifier = SelectClassifier(num_words, num_hidden,
                                               num_layers, self.max_tok_num)

        # WHERE Clause
        self.whr_classifier = WhereClassifier(num_words, num_hidden,
                                              num_layers, self.max_col_num,
                                              self.max_tok_num, use_gpu)

        # run on GPU
        if use_gpu:
            self.cuda()

    def generate_g_s(self, q, col, query):
        # data format
        # <BEG> WHERE cond1_col cond1_op cond1
        #         AND cond2_col cond2_op cond2
        #         AND ... <END>

        ret_seq = []
        for cur_q, cur_col, cur_query in zip(q, col, query):
            connect_col = [
                tok for col_tok in cur_col for tok in col_tok + [',']
            ]
            all_toks = self.SQL_TOK + connect_col + [None] + cur_q + [None]
            cur_seq = [all_toks.index('<BEG>')]
            if 'WHERE' in cur_query:
                cur_where_query = cur_query[cur_query.index('WHERE'):]
                cur_seq = cur_seq + map(
                    lambda tok: all_toks.index(tok)
                    if tok in all_toks else 0, cur_where_query)
            cur_seq.append(all_toks.index('<END>'))
            ret_seq.append(cur_seq)
        return ret_seq

    def forward(self,
                q,
                col,
                col_num,
                classif_flag,
                g_s=None,
                reinforce=False):

        agg_classif, sel_classif, whr_classif = classif_flag
        agg_score, sel_score, whr_score = None, None, None

        x_emb_var, x_len = self.embed_layer.gen_x_batch(q, col)

        if agg_classif:
            agg_score = self.agg_classifier(x_emb_var, x_len)

        if sel_classif:
            col_inp_var, col_name_len, col_len = self.embed_layer.gen_col_batch(
                col)
            sel_score = self.sel_classifier(x_emb_var, x_len, col_inp_var,
                                            col_name_len, col_len, col_num)

        if whr_classif:
            whr_score = self.whr_classifier(x_emb_var,
                                            x_len,
                                            g_s,
                                            reinforce=reinforce)

        return (agg_score, sel_score, whr_score)

    def loss(self, score, ref_score, classif_flag, g_s):
        agg_classif, sel_classif, whr_classif = classif_flag
        agg_score, sel_score, whr_score = score
        loss = 0
        if agg_classif:
            agg_ref = torch.from_numpy(np.array([x[0] for x in ref_score]))
            agg_ref_var = Variable(agg_ref)
            if self.use_gpu:
                agg_ref_var = agg_ref_var.cuda()
            loss += nn.CrossEntropyLoss()(agg_score, agg_ref_var)

        if sel_classif:
            sel_ref = torch.from_numpy(np.array([x[1] for x in ref_score]))
            sel_ref_var = Variable(sel_ref)
            if self.use_gpu:
                sel_ref_var = sel_ref_var.cuda()
            loss += nn.CrossEntropyLoss()(sel_score, sel_ref_var)

        if whr_classif:
            g_s_len = len(g_s)
            for s, g_s_i in enumerate(g_s):
                whr_ref_var = Variable(torch.from_numpy(np.array(g_s_i[1:])))
                if self.use_gpu:
                    whr_ref_var = whr_ref_var.cuda()
                loss += (nn.CrossEntropyLoss()(whr_score[s, :len(g_s_i) - 1],
                                               whr_ref_var) / g_s_len)

        return loss

    def reinforce_backward(self, score, rewards):
        agg_score, sel_score, whr_score = score

        cur_reward = rewards[:]
        eof = self.SQL_TOK.index('<END>')
        for whr_score_t in whr_score[1]:
            reward_inp = torch.FloatTensor(cur_reward).unsqueeze(1)
            if self.use_gpu:
                reward_inp = reward_inp.cuda()
            whr_score_t.reinforce(reward_inp)

            for b, _ in enumerate(rewards):
                if whr_score_t[b].data.cpu().numpy()[0] == eof:
                    cur_reward[b] = 0
        torch.autograd.backward(whr_score[1], [None for _ in whr_score[1]])
        return

    def check_acc(self, classif_queries, g_s_queries, classif_flag):

        agg_classif, sel_classif, whr_classif = classif_flag
        tot_err = agg_err = sel_err = whr_err = whr_num_err = whr_col_err = whr_op_err = whr_val_err = 0.0
        for classif_qry, g_s_qry in zip(classif_queries, g_s_queries):

            agg_err_inc = 1 if agg_classif and classif_qry['agg'] != g_s_qry[
                'agg'] else 0
            agg_err += agg_err_inc

            sel_err_inc = 1 if sel_classif and classif_qry['sel'] != g_s_qry[
                'sel'] else 0
            sel_err += sel_err_inc

            if whr_classif:
                flag = True
                whr_classifier = classif_qry['conds']
                whr_g_s = g_s_qry['conds']
                if len(whr_classifier) != len(whr_g_s):
                    flag = False
                    whr_num_err += 1
                elif set(x[0]
                         for x in whr_classifier) != set(x[0]
                                                         for x in whr_g_s):
                    flag = False
                    whr_col_err += 1
                if flag:
                    for whr_class_i in whr_classifier:
                        g_s_idx = tuple(x[0]
                                        for x in whr_g_s).index(whr_class_i[0])
                        if flag and whr_g_s[g_s_idx][1] != whr_class_i[1]:
                            flag = False
                            whr_op_err += 1
                            break
                if flag:
                    for whr_class_i in whr_classifier:
                        g_s_idx = tuple(x[0]
                                        for x in whr_g_s).index(whr_class_i[0])
                        if flag and str(whr_g_s[g_s_idx][2]).lower() != \
                                str(whr_class_i[2]).lower():
                            flag = False
                            whr_val_err += 1
                            break

                if not flag:
                    whr_err += 1

            if agg_err_inc > 0 or sel_err_inc > 0 or (whr_classif and not flag):
                tot_err += 1

        return np.array((agg_err, sel_err, whr_err)), tot_err

    def gen_query(self,
                  score,
                  q,
                  col,
                  raw_q,
                  raw_col,
                  classif_flag,
                  reinforce=False,
                  verbose=False):
        def merge_tokens(tok_list, raw_tok_str):
            tok_str = raw_tok_str.lower()
            special = {
                '-LRB-': '(',
                '-RRB-': ')',
                '-LSB-': '[',
                '-RSB-': ']',
                '``': '"',
                '\'\'': '"',
                '--': u'\u2013'
            }
            ret = ''
            double_quote_pair_track = 0
            for raw_tok in tok_list:
                if not raw_tok:
                    continue
                tok = special.get(raw_tok, raw_tok)
                if tok == '"':
                    double_quote_pair_track = 1 - double_quote_pair_track
                    if double_quote_pair_track:
                        ret = ret + ' '
                if len(ret) == 0:
                    pass
                elif len(ret) > 0 and ret + ' ' + tok in tok_str:
                    ret = ret + ' '
                elif len(ret) > 0 and ret + tok in tok_str:
                    pass
                elif (tok[0] not in string.ascii_lowercase) and (
                        tok[0] not in string.digits) and (tok[0] not in '$('):
                    pass
                elif (ret[-1] not in ['(', '/', u'\u2013', '#', '$', '&']) and \
                     (ret[-1] != '"' or not double_quote_pair_track):
                    ret = ret + ' '
                ret = ret + tok
            return ret.strip()

        agg_classif, sel_classif, whr_classif = classif_flag
        agg_score, sel_score, whr_score = score

        ret_queries = []
        batch_len = len(agg_score) if agg_classif else len(
            sel_score) if sel_classif else len(
                whr_score[0]) if reinforce else len(whr_score)
        for b in range(batch_len):
            cur_query = {}
            if agg_classif:
                cur_query['agg'] = np.argmax(agg_score[b].data.cpu().numpy())
            if sel_classif:
                cur_query['sel'] = np.argmax(sel_score[b].data.cpu().numpy())
            if whr_classif:
                cur_query['conds'] = []
                all_toks = self.SQL_TOK + [
                    x for toks in col[b] for x in toks + [',']
                ] + [''] + q[b] + ['']
                whr_toks = []
                if reinforce:
                    for choices in whr_score[1]:
                        if choices[b].data.cpu().numpy()[0] < len(all_toks):
                            whr_val = all_toks[choices[b].data.cpu().numpy()
                                               [0]]
                        else:
                            whr_val = '<UNK>'
                        if whr_val == '<END>':
                            break
                        whr_toks.append(whr_val)
                else:
                    for where_score in whr_score[b].data.cpu().numpy():
                        whr_tok = np.argmax(where_score)
                        whr_val = all_toks[whr_tok]
                        if whr_val == '<END>':
                            break
                        whr_toks.append(whr_val)

                if verbose:
                    print(whr_toks)
                if len(whr_toks) > 0:
                    whr_toks = whr_toks[1:]
                st = 0
                while st < len(whr_toks):
                    cur_cond = [None, None, None]
                    ed = len(whr_toks) if 'AND' not in whr_toks[st:] \
                         else whr_toks[st:].index('AND') + st
                    if 'EQL' in whr_toks[st:ed]:
                        op = whr_toks[st:ed].index('EQL') + st
                        cur_cond[1] = 0
                    elif 'GT' in whr_toks[st:ed]:
                        op = whr_toks[st:ed].index('GT') + st
                        cur_cond[1] = 1
                    elif 'LT' in whr_toks[st:ed]:
                        op = whr_toks[st:ed].index('LT') + st
                        cur_cond[1] = 2
                    else:
                        op = st
                        cur_cond[1] = 0
                    sel_col = whr_toks[st:op]
                    to_idx = [x.lower() for x in raw_col[b]]
                    classif_col = merge_tokens(sel_col, raw_q[b] + ' || ' + \
                                            ' || '.join(raw_col[b]))
                    if classif_col in to_idx:
                        cur_cond[0] = to_idx.index(classif_col)
                    else:
                        cur_cond[0] = 0
                    cur_cond[2] = merge_tokens(whr_toks[op + 1:ed], raw_q[b])
                    cur_query['conds'].append(cur_cond)
                    st = ed + 1
            ret_queries.append(cur_query)

        return ret_queries
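A note on `generate_g_s` above: it maps each ground-truth WHERE clause onto indices over a combined token list (SQL keywords, column tokens, question tokens). The standalone sketch below reproduces that indexing scheme with invented tokens; the helper name and the toy inputs are illustrative only and are not part of the Seq2SQL code.

# Standalone illustration of the <BEG> ... <END> target-index construction.
SQL_TOK = ['<UNK>', '<BEG>', '<END>', 'WHERE', 'AND', 'EQL', 'GT', 'LT']

def where_target_indices(question_toks, col_toks, query_toks):
    connect_col = [tok for col in col_toks for tok in col + [',']]
    all_toks = SQL_TOK + connect_col + [None] + question_toks + [None]
    seq = [all_toks.index('<BEG>')]
    if 'WHERE' in query_toks:
        where_part = query_toks[query_toks.index('WHERE'):]
        seq += [all_toks.index(t) if t in all_toks else 0 for t in where_part]
    seq.append(all_toks.index('<END>'))
    return seq

print(where_target_indices(
    ['how', 'many', 'wins', 'for', 'dallas'],
    [['team'], ['wins']],
    ['SELECT', 'wins', 'WHERE', 'team', 'EQL', 'dallas']))
# -> [1, 3, 8, 5, 17, 2]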
Example #19
0
    elif args.train_component == "keyword":
        model = KeyWordPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth, gpu=GPU, use_hs=use_hs, bert=bert)
    elif args.train_component == "col":
        model = ColPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth, gpu=GPU, use_hs=use_hs, bert=bert)
    elif args.train_component == "op":
        model = OpPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth, gpu=GPU, use_hs=use_hs, bert=bert)
    elif args.train_component == "agg":
        model = AggPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth, gpu=GPU, use_hs=use_hs, bert=bert)
    elif args.train_component == "root_tem":
        model = RootTeminalPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth, gpu=GPU, use_hs=use_hs, bert=bert)
    elif args.train_component == "des_asc":
        model = DesAscLimitPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth, gpu=GPU, use_hs=use_hs, bert=bert)
    elif args.train_component == "having":
        model = HavingPredictor(N_word=N_word,N_h=N_h,N_depth=N_depth, gpu=GPU, use_hs=use_hs, bert=bert)
    elif args.train_component == "andor":
        model = AndOrPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth, gpu=GPU, use_hs=use_hs, bert=bert)
    elif args.train_component == "from":
        model = FromPredictor(N_word=N_word, N_h=N_h, N_depth=N_depth, gpu=GPU, use_hs=use_hs, bert=bert)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0)
    if BERT:
        optimizer_bert = torch.optim.Adam(bert_model.parameters(), lr=bert_learning_rate)
    else:
        optimizer_bert = None
    print("finished build model")

    print_flag = False
    model.load_state_dict(torch.load(args.load_path))
    embed_layer = WordEmbedding(word_emb, N_word, gpu=GPU, SQL_TOK=SQL_TOK, use_bert=BERT, trainable=False)
    acc = epoch_acc(model, BATCH_SIZE, args.train_component, embed_layer, dev_data, table_type=args.table_type)
    print("finished: {}".format(time.time() - start_time))
Example #20
0
def bolukbasi_debias_original(embedding,
                              word_pairs,
                              out_file,
                              excludes=None,
                              mirrors=None,
                              **kwargs):
    # type: (WordEmbedding, Iterable[Tuple[str, str]], Path, Iterable[str], Iterable[Tuple[str, str]], **Any) -> WordEmbedding
    """Debias a word embedding using Bolukbasi's original algorithm.

    Adapted from https://github.com/tolga-b/debiaswe/blob/master/debiaswe/debias.py#L19
    Commit 10277b23e187ee4bd2b6872b507163ef4198686b on 2018-04-02

    Parameters:
        embedding (WordEmbedding): The word embedding to debias.
        word_pairs (Iterable[Tuple[str, str]]):
            A list of word pairs that define the bias subspace.
        out_file (Path):
            The path to save the new embedding to.
        excludes (Iterable[str]):
            A collection of words to be excluded from the debiasing.
        mirrors (Iterable[Tuple[str, str]]):
            Specific words that should be equidistant.
        **kwargs: Other keyword arguments.

    Returns:
        WordEmbedding: The debiased word embedding.
    """
    if out_file.exists():
        return WordEmbedding.load_word2vec_file(out_file)

    # define the bias subspace

    # recenter words
    matrix = []
    for male_word, female_word in word_pairs:
        if male_word not in embedding or female_word not in embedding:
            continue
        matrix.extend(
            recenter(np.array([embedding[male_word], embedding[female_word]])))

    bias_subspace = define_bias_subspace(matrix, **kwargs)
    bias_subspace = _align_gender_direction(embedding, bias_subspace,
                                            word_pairs)
    bias_subspace = bias_subspace[np.newaxis, :]

    # debias by rejecting the subspace and reverting the excluded words
    if excludes is None:
        excludes = set()
    new_vectors = reject(embedding.vectors, bias_subspace)
    for word in excludes:
        if word in embedding:
            new_vectors[embedding.index(word)] = embedding[word]
    new_vectors = normalize(new_vectors)

    # FIXME does equalizing make sense in higher dimensions?
    #new_vectors = _bolukbasi_equalize(embedding, new_vectors, bias_subspace, mirrors)

    # create a word embedding from the new vectors
    new_embedding = WordEmbedding.from_vectors(embedding.words, new_vectors)
    new_embedding.source = out_file
    new_embedding.save()
    return new_embedding
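The key step in `bolukbasi_debias_original` is the call to `reject`, which removes each word vector's component along the bias subspace before renormalizing. Below is a rough, self-contained sketch of that operation for the single-direction case; the function name and the toy vectors are assumptions for illustration, not the project's actual `reject` helper.

import numpy as np

def reject_direction(vectors, direction):
    """Remove each row's component along `direction`, then renormalize."""
    direction = direction / np.linalg.norm(direction)
    components = vectors @ direction                 # scalar projection per row
    debiased = vectors - np.outer(components, direction)
    return debiased / np.linalg.norm(debiased, axis=1, keepdims=True)

# toy example: three 2-D vectors, bias direction along the x-axis
vecs = np.array([[1.0, 1.0], [2.0, -1.0], [0.5, 3.0]])
print(reject_direction(vecs, np.array([1.0, 0.0])))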
Example #21
0
def make_dataset(use_full_dataset=True):
    # Make txt dataset
    txt_dataset = _make_text_dataset(use_full_dataset=use_full_dataset)

    # Load full stanford embedding from file
    stanford = pickle.load(open(constants.GLOVE_EMBEDDING_STANFORD_PATH, "rb"))

    # Create vocabulary, cut at top 20k words
    word_index = _make_capped_word_index(stanford, txt_dataset)

    # Reduce embedding matrix to include top 20k words
    embedding_vectors = _make_embeddings(stanford, word_index)

    # Create ID dataset, with <pad>'s and <unk>'s
    id_dataset = make_id_dataset(txt_dataset, word_index)

    print("Creating and saving indices...")
    # Make ordering indices for shuffling data
    N = len(id_dataset["train_tweets"])
    index = np.arange(N)
    np.random.seed(constants.SEED)
    np.random.shuffle(index)

    # Divide indices into train and test indices
    divider = int(constants.SPLIT_RATIO * N)
    train_index = index[:divider]
    test_index = index[divider:]
    data_index = {"train_index": train_index, "test_index": test_index}

    # Save data index
    index_path = constants.DATA_INDEX_SMALL_PATH
    if use_full_dataset: index_path = constants.DATA_INDEX_FULL_PATH
    pickle.dump(data_index, open(index_path, "wb"))

    # Save to pickle
    print("Saving word embeddings...")
    word_embedding_20k = WordEmbedding(embedding_vectors, word_index)
    pickle.dump(word_embedding_20k,
                open(constants.STANFORD_20K_EMBEDDING_PATH, "wb"))

    print("Saving txt dataset...")
    txt_dataset_path = constants.TXT_DATASET_SMALL_PATH
    if use_full_dataset: txt_dataset_path = constants.TXT_DATASET_FULL_PATH
    pickle.dump(txt_dataset, open(txt_dataset_path, "wb"))

    print("Saving id dataset...")
    id_dataset_path = constants.ID_DATASET_SMALL_PATH
    if use_full_dataset: id_dataset_path = constants.ID_DATASET_FULL_PATH
    pickle.dump(id_dataset, open(id_dataset_path, "wb"))

    # Plot tweet length distribution
    tweets_lengths = np.array(
        [len(tweet) for tweet in txt_dataset["train_tweets"]])
    plt.hist(tweets_lengths, bins=50, edgecolor="black")
    plt.xlabel("Tweet length")
    plt.ylabel("Frequency")
    plt.savefig(constants.PLOTS_DIR + "tweet_lengths.eps",
                format="eps",
                dpi=1000,
                bbox_inches="tight")

    # Print tweet fraction with length <= 40 words
    frac_max_40_words = len(
        tweets_lengths[tweets_lengths <= 40]) / len(tweets_lengths)
    print("Fraction of tweets with length <= 40 words:", frac_max_40_words)
Example #22
0
        return len(self.words)

    def dim(self):
        return self.embedding.dim

    def get_chunk(self):
        chunk_words = []
        got_good_words = False
        while not got_good_words:
            sta_ind = np.random.randint(0, len(self) - self.chunk_size - 1)
            end_ind = sta_ind + self.chunk_size
            chunk_words = self.words[sta_ind:end_ind]
            got_good_words = all(word in self.embedding for word in chunk_words)
        vec_chunk = np.stack([self.embedding[word] for word in chunk_words])
        return torch.from_numpy(vec_chunk)

    def get_chunks(self, n_chunks):
        return torch.stack([self.get_chunk() for _ in range(n_chunks)])


if __name__ == '__main__':
    embedding_fn = '/Users/bkeating/nltk_data/embeddings/glove/glove.6B.100d.txt'
    embedding = WordEmbedding(embedding_fn)
    dataset = WordDataset('data/keywell_corpus.txt', embedding)

    chunk = dataset.get_chunk()
    print(chunk.size())

    chunks = dataset.get_chunks(20)
    print(chunks.size())
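`get_chunk` above keeps drawing random contiguous windows until every word in the window is covered by the embedding. The following is a minimal standalone sketch of that rejection-sampling idea, using a toy corpus and vocabulary rather than the Keywell data.

import numpy as np

def sample_covered_chunk(words, vocab, chunk_size, seed=0):
    """Redraw a random contiguous window until all tokens are in `vocab`."""
    rng = np.random.default_rng(seed)
    while True:
        start = int(rng.integers(0, len(words) - chunk_size))
        chunk = words[start:start + chunk_size]
        if all(word in vocab for word in chunk):
            return chunk

corpus = "the cat sat on the mat near the door".split()
print(sample_covered_chunk(corpus, {"the", "cat", "sat", "on", "mat"}, 3))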
Example #23
0
class SQLNet(nn.Module):
    def __init__(self,
                 word_emb,
                 N_word,
                 N_h=100,
                 N_depth=2,
                 gpu=False,
                 use_ca=True,
                 trainable_emb=False):
        super(SQLNet, self).__init__()
        self.use_ca = use_ca
        self.trainable_emb = trainable_emb

        self.gpu = gpu
        self.N_h = N_h
        self.N_depth = N_depth

        self.max_col_num = 45
        self.max_tok_num = 200
        self.SQL_TOK = [
            '<UNK>', '<END>', 'WHERE', 'AND', 'OR', '==', '>', '<', '!=',
            '<BEG>'
        ]
        self.COND_OPS = ['>', '<', '==', '!=']

        # Word embeddings: either train your own or load pretrained vectors; pretrained vectors are used here
        self.embed_layer = WordEmbedding(word_emb,
                                         N_word,
                                         gpu,
                                         self.SQL_TOK,
                                         our_model=True,
                                         trainable=trainable_emb)

        # Predict the number of selected columns
        self.sel_num = SelNumPredictor(N_word, N_h, N_depth, use_ca=use_ca)

        # Predict which columns are selected
        self.sel_pred = SelPredictor(N_word,
                                     N_h,
                                     N_depth,
                                     self.max_tok_num,
                                     use_ca=use_ca)

        # Predict the aggregation function for each selected column
        self.agg_pred = AggPredictor(N_word, N_h, N_depth, use_ca=use_ca)

        # Predict the number of conditions, the condition columns, operators, and values
        self.cond_pred = SQLNetCondPredictor(N_word, N_h, N_depth,
                                             self.max_col_num,
                                             self.max_tok_num, use_ca, gpu)

        # Predict the relation between conditions, e.g. "and" / "or"
        self.where_rela_pred = WhereRelationPredictor(N_word,
                                                      N_h,
                                                      N_depth,
                                                      use_ca=use_ca)

        self.CE = nn.CrossEntropyLoss()  # cross-entropy loss
        self.softmax = nn.Softmax(dim=-1)
        self.log_softmax = nn.LogSoftmax()
        self.bce_logit = nn.BCEWithLogitsLoss()
        if gpu:
            self.cuda()

    # q: questions; gt_cond_seq: (column, op, value) triples. Builds the ground-truth WHERE value sequences.
    def generate_gt_where_seq_test(self, q, gt_cond_seq):
        ret_seq = []
        for cur_q, ans in zip(q, gt_cond_seq):
            temp_q = u"".join(cur_q)
            cur_q = [u'<BEG>'] + cur_q + [u'<END>']  # prepend <BEG> and append <END> to each question
            record = []  # mark (True, value) when the condition value appears in the question
            record_cond = []
            for cond in ans:
                if cond[2] not in temp_q:
                    record.append((False, cond[2]))
                else:
                    record.append((True, cond[2]))
            for idx, item in enumerate(record):
                temp_ret_seq = []
                if item[0]:
                    temp_ret_seq.append(0)
                    temp_ret_seq.extend(
                        list(
                            range(
                                temp_q.index(item[1]) + 1,
                                temp_q.index(item[1]) + len(item[1]) +
                                1)))  # indices of the condition value inside the question
                    temp_ret_seq.append(len(cur_q) - 1)
                else:
                    temp_ret_seq.append([0, len(cur_q) - 1])
                record_cond.append(temp_ret_seq)
            ret_seq.append(record_cond)
        return ret_seq
        # q: questions; col: header names; col_num: number of header columns; gt_where: ground-truth WHERE value sequences (condition values may not appear in the question); gt_cond: conds triples; gt_sel: selected columns; gt_sel_num: number of selected columns
    def forward(self,
                q,
                col,
                col_num,
                gt_where=None,
                gt_cond=None,
                reinforce=False,
                gt_sel=None,
                gt_sel_num=None):
        B = len(q)  # batch size

        sel_num_score = None
        agg_score = None
        sel_score = None
        cond_score = None
        # predict the aggregation function
        if self.trainable_emb:
            x_emb_var, x_len = self.agg_embed_layer.gen_x_batch(q, col)
            col_inp_var, col_name_len, col_len = self.agg_embed_layer.gen_col_batch(
                col)
            max_x_len = max(x_len)
            agg_score = self.agg_pred(x_emb_var,
                                      x_len,
                                      col_inp_var,
                                      col_name_len,
                                      col_len,
                                      col_num,
                                      gt_sel=gt_sel)

            x_emb_var, x_len = self.sel_embed_layer.gen_x_batch(q, col)
            col_inp_var, col_name_len, col_len = self.sel_embed_layer.gen_col_batch(
                col)
            max_x_len = max(x_len)
            sel_score = self.sel_pred(x_emb_var, x_len, col_inp_var,
                                      col_name_len, col_len, col_num)

            x_emb_var, x_len = self.cond_embed_layer.gen_x_batch(q, col)
            col_inp_var, col_name_len, col_len = self.cond_embed_layer.gen_col_batch(
                col)
            max_x_len = max(x_len)
            cond_score = self.cond_pred(x_emb_var,
                                        x_len,
                                        col_inp_var,
                                        col_name_len,
                                        col_len,
                                        col_num,
                                        gt_where,
                                        gt_cond,
                                        reinforce=reinforce)
            where_rela_score = None
        else:
            x_emb_var, x_len = self.embed_layer.gen_x_batch(
                q, col
            )  # x_len: length of each question in the batch; x_emb_var: [batch_size, max_seq_len, word_embedding_size]
            col_inp_var, col_name_len, col_len = self.embed_layer.gen_col_batch(
                col)  # column-name embeddings, name lengths, and column counts per table
            sel_num_score = self.sel_num(
                x_emb_var, x_len, col_inp_var, col_name_len, col_len,
                col_num)  # [16,4]: question encoding run through LSTM, linear and softmax layers
            # x_emb_var: embedding of each question
            # x_len: length of each question
            # col_inp_var: embedding of each header
            # col_name_len: length of each header
            # col_len: number of headers in each table, array type
            # col_num: number of headers in each table, list type
            if gt_sel_num:
                pr_sel_num = gt_sel_num
            else:
                pr_sel_num = np.argmax(sel_num_score.data.cpu().numpy(),
                                       axis=1)
            sel_score = self.sel_pred(x_emb_var, x_len, col_inp_var,
                                      col_name_len, col_len, col_num)  # [16,19]

            if gt_sel:
                pr_sel = gt_sel
            else:
                num = np.argmax(sel_num_score.data.cpu().numpy(), axis=1)
                sel = sel_score.data.cpu().numpy()
                pr_sel = [
                    list(np.argsort(-sel[b])[:num[b]]) for b in range(len(num))
                ]
            agg_score = self.agg_pred(x_emb_var,
                                      x_len,
                                      col_inp_var,
                                      col_name_len,
                                      col_len,
                                      col_num,
                                      gt_sel=pr_sel,
                                      gt_sel_num=pr_sel_num)  # [16,4,6]

            where_rela_score = self.where_rela_pred(x_emb_var, x_len,
                                                    col_inp_var, col_name_len,
                                                    col_len, col_num)  # [16,3]

            cond_score = self.cond_pred(x_emb_var,
                                        x_len,
                                        col_inp_var,
                                        col_name_len,
                                        col_len,
                                        col_num,
                                        gt_where,
                                        gt_cond,
                                        reinforce=reinforce)  # 4 => [16,5]

        return (sel_num_score, sel_score, agg_score, cond_score,
                where_rela_score)

    def loss(self, score, truth_num, gt_where):
        sel_num_score, sel_score, agg_score, cond_score, where_rela_score = score

        B = len(truth_num)
        loss = 0

        # Evaluate select number
        sel_num_truth = list(map(lambda x: x[0], truth_num))  # number of aggregation functions
        sel_num_truth = torch.from_numpy(
            np.array(sel_num_truth)).long()  #.astype(float))
        if self.gpu:
            sel_num_truth = Variable(sel_num_truth.cuda())
        else:
            sel_num_truth = Variable(sel_num_truth)
        # loss for the number of selected columns
        loss += self.CE(sel_num_score, sel_num_truth)

        # Evaluate select column: loss for which columns are selected
        T = len(sel_score[0])
        truth_prob = np.zeros((B, T), dtype=np.float32)
        for b in range(B):
            truth_prob[b][list(truth_num[b][1])] = 1
        data = torch.from_numpy(truth_prob)
        if self.gpu:
            sel_col_truth_var = Variable(data.cuda())
        else:
            sel_col_truth_var = Variable(data)
        sigm = nn.Sigmoid()
        sel_col_prob = sigm(sel_score)
        bce_loss = -torch.mean(
            3 * (sel_col_truth_var * torch.log(sel_col_prob + 1e-10)) +
            (1 - sel_col_truth_var) * torch.log(1 - sel_col_prob + 1e-10)
        )  # weighted BCE loss: -w*[y*log(x)+(1-y)*log(1-x)]
        loss += bce_loss

        # Evaluate select aggregation: cross-entropy loss over the aggregation functions
        for b in range(len(truth_num)):
            data = torch.from_numpy(np.array(truth_num[b][2]))  # ground-truth aggregation functions
            if self.gpu:
                sel_agg_truth_var = Variable(data.cuda())
            else:
                sel_agg_truth_var = Variable(data.long())
            sel_agg_pred = agg_score[b, :len(truth_num[b][1])]  # six possible aggregation functions
            loss += (self.CE(sel_agg_pred, sel_agg_truth_var)) / len(truth_num)

        cond_num_score, cond_col_score, cond_op_score, cond_str_score = cond_score

        # Evaluate the number of conditions: cross-entropy loss over the predicted count
        cond_num_truth = list(map(lambda x: x[3], truth_num))
        data = torch.from_numpy(np.array(cond_num_truth).astype(float)).long()
        if self.gpu:
            try:
                cond_num_truth_var = Variable(data.cuda())
            except:
                print("cond_num_truth_var error")
                print(data)
                exit(0)
        else:
            cond_num_truth_var = Variable(data)
        loss += self.CE(cond_num_score, cond_num_truth_var)

        # Evaluate the condition columns
        T = len(cond_col_score[0])
        truth_prob = np.zeros((B, T), dtype=np.float32)
        for b in range(B):
            if len(truth_num[b][4]) > 0:
                truth_prob[b][list(truth_num[b][4])] = 1  # condition columns
        data = torch.from_numpy(truth_prob)
        if self.gpu:
            cond_col_truth_var = Variable(data.cuda())
        else:
            cond_col_truth_var = Variable(data)

        sigm = nn.Sigmoid()
        cond_col_prob = sigm(cond_col_score)
        bce_loss = -torch.mean(
            3 * (cond_col_truth_var * torch.log(cond_col_prob + 1e-10)) +
            (1 - cond_col_truth_var) * torch.log(1 - cond_col_prob + 1e-10))
        loss += bce_loss

        # Evaluate the condition operators
        for b in range(len(truth_num)):
            if len(truth_num[b][5]) == 0:  # condition operator types
                continue
            data = torch.from_numpy(np.array(truth_num[b][5])).long()
            if self.gpu:
                cond_op_truth_var = Variable(data.cuda())
            else:
                cond_op_truth_var = Variable(data)
            cond_op_pred = cond_op_score[b, :len(truth_num[b][5])]
            # try:
            loss += (self.CE(cond_op_pred, cond_op_truth_var) / len(truth_num))
            # except:
            #     print(cond_op_pred)
            #     print(cond_op_truth_var)
            #     exit(0)

        # Evaluate the condition value strings
        for b in range(len(gt_where)):
            for idx in range(len(gt_where[b])):
                cond_str_truth = gt_where[b][idx]
                if len(cond_str_truth) == 1:
                    continue
                data = torch.from_numpy(np.array(cond_str_truth[1:])).long()
                if self.gpu:
                    cond_str_truth_var = Variable(data.cuda())
                else:
                    cond_str_truth_var = Variable(data)
                str_end = len(cond_str_truth) - 1
                cond_str_pred = cond_str_score[b, idx, :str_end]
                loss += (self.CE(cond_str_pred, cond_str_truth_var) \
                        / (len(gt_where) * len(gt_where[b])))

        # Evaluate the condition relationship (and / or)
        where_rela_truth = list(map(lambda x: x[6], truth_num))
        data = torch.from_numpy(np.array(where_rela_truth)).long()
        if self.gpu:
            try:
                where_rela_truth = Variable(data.cuda())
            except:
                print("where_rela_truth error")
                print(data)
                exit(0)
        else:
            where_rela_truth = Variable(data)
        loss += self.CE(where_rela_score, where_rela_truth)
        return loss

    def check_acc(self, vis_info, pred_queries, gt_queries):
        def gen_cond_str(conds, header):
            if len(conds) == 0:
                return 'None'
            cond_str = []
            for cond in conds:
                cond_str.append(header[cond[0]] + ' ' +
                                self.COND_OPS[cond[1]] + ' ' +
                                str(cond[2]).lower())
            return 'WHERE ' + ' AND '.join(cond_str)

        tot_err = sel_num_err = agg_err = sel_err = 0.0
        cond_num_err = cond_col_err = cond_op_err = cond_val_err = cond_rela_err = 0.0
        for b, (pred_qry, gt_qry) in enumerate(zip(pred_queries, gt_queries)):
            good = True
            sel_pred, agg_pred, where_rela_pred = pred_qry['sel'], pred_qry[
                'agg'], pred_qry['cond_conn_op']
            sel_gt, agg_gt, where_rela_gt = gt_qry['sel'], gt_qry[
                'agg'], gt_qry['cond_conn_op']

            if where_rela_gt != where_rela_pred:
                good = False
                cond_rela_err += 1

            if len(sel_pred) != len(sel_gt):
                good = False
                sel_num_err += 1

            pred_sel_dict = {
                k: v
                for k, v in zip(list(sel_pred), list(agg_pred))
            }
            gt_sel_dict = {k: v for k, v in zip(sel_gt, agg_gt)}
            if set(sel_pred) != set(sel_gt):
                good = False
                sel_err += 1
            agg_pred = [pred_sel_dict[x] for x in sorted(pred_sel_dict.keys())]
            agg_gt = [gt_sel_dict[x] for x in sorted(gt_sel_dict.keys())]
            if agg_pred != agg_gt:
                good = False
                agg_err += 1

            cond_pred = pred_qry['conds']
            cond_gt = gt_qry['conds']
            if len(cond_pred) != len(cond_gt):
                good = False
                cond_num_err += 1
            else:
                cond_op_pred, cond_op_gt = {}, {}
                cond_val_pred, cond_val_gt = {}, {}
                for p, g in zip(cond_pred, cond_gt):
                    cond_op_pred[p[0]] = p[1]
                    cond_val_pred[p[0]] = p[2]
                    cond_op_gt[g[0]] = g[1]
                    cond_val_gt[g[0]] = g[2]

                if set(cond_op_pred.keys()) != set(cond_op_gt.keys()):
                    cond_col_err += 1
                    good = False

                where_op_pred = [
                    cond_op_pred[x] for x in sorted(cond_op_pred.keys())
                ]
                where_op_gt = [
                    cond_op_gt[x] for x in sorted(cond_op_gt.keys())
                ]
                if where_op_pred != where_op_gt:
                    cond_op_err += 1
                    good = False

                where_val_pred = [
                    cond_val_pred[x] for x in sorted(cond_val_pred.keys())
                ]
                where_val_gt = [
                    cond_val_gt[x] for x in sorted(cond_val_gt.keys())
                ]
                if where_val_pred != where_val_gt:
                    cond_val_err += 1
                    good = False

            if not good:
                tot_err += 1

        return np.array(
            (sel_num_err, sel_err, agg_err, cond_num_err, cond_col_err,
             cond_op_err, cond_val_err, cond_rela_err)), tot_err

    def gen_query(self, score, q, col, raw_q, reinforce=False, verbose=False):
        """
        :param score:
        :param q: token-questions
        :param col: token-headers
        :param raw_q: original question sequence
        :return:
        """
        def merge_tokens(tok_list, raw_tok_str):
            tok_str = raw_tok_str  # .lower()
            alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789$('
            special = {
                '-LRB-': '(',
                '-RRB-': ')',
                '-LSB-': '[',
                '-RSB-': ']',
                '``': '"',
                '\'\'': '"',
                '--': u'\u2013'
            }
            ret = ''
            double_quote_appear = 0
            for raw_tok in tok_list:
                if not raw_tok:
                    continue
                tok = special.get(raw_tok, raw_tok)
                if tok == '"':
                    double_quote_appear = 1 - double_quote_appear
                if len(ret) == 0:
                    pass
                elif len(ret) > 0 and ret + ' ' + tok in tok_str:
                    ret = ret + ' '
                elif len(ret) > 0 and ret + tok in tok_str:
                    pass
                elif tok == '"':
                    if double_quote_appear:
                        ret = ret + ' '
                # elif tok[0] not in alphabet:
                #     pass
                elif (ret[-1] not in ['(', '/', u'\u2013', '#', '$', '&']) \
                        and (ret[-1] != '"' or not double_quote_appear):
                    ret = ret + ' '
                ret = ret + tok
            return ret.strip()

        sel_num_score, sel_score, agg_score, cond_score, where_rela_score = score
        # [64,4,6], [64,14], ..., [64,4]
        sel_num_score = sel_num_score.data.cpu().numpy()
        sel_score = sel_score.data.cpu().numpy()
        agg_score = agg_score.data.cpu().numpy()
        where_rela_score = where_rela_score.data.cpu().numpy()
        ret_queries = []
        B = len(agg_score)
        cond_num_score,cond_col_score,cond_op_score,cond_str_score =\
            [x.data.cpu().numpy() for x in cond_score]
        for b in range(B):
            cur_query = {}
            cur_query['sel'] = []
            cur_query['agg'] = []
            sel_num = np.argmax(sel_num_score[b])
            max_col_idxes = np.argsort(-sel_score[b])[:sel_num]
            # find the most-probable columns' indexes
            max_agg_idxes = np.argsort(-agg_score[b])[:sel_num]
            cur_query['sel'].extend([int(i) for i in max_col_idxes])
            cur_query['agg'].extend([i[0] for i in max_agg_idxes])
            cur_query['cond_conn_op'] = np.argmax(where_rela_score[b])
            cur_query['conds'] = []
            cond_num = np.argmax(cond_num_score[b])
            all_toks = ['<BEG>'] + q[b] + ['<END>']
            max_idxes = np.argsort(-cond_col_score[b])[:cond_num]
            for idx in range(cond_num):
                cur_cond = []
                cur_cond.append(max_idxes[idx])  # where-col
                cur_cond.append(np.argmax(cond_op_score[b][idx]))  # where-op
                cur_cond_str_toks = []
                for str_score in cond_str_score[b][idx]:
                    str_tok = np.argmax(str_score[:len(all_toks)])
                    str_val = all_toks[str_tok]
                    if str_val == '<END>':
                        break
                    cur_cond_str_toks.append(str_val)
                cur_cond.append(merge_tokens(cur_cond_str_toks, raw_q[b]))
                cur_query['conds'].append(cur_cond)
            ret_queries.append(cur_query)
        return ret_queries
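The select-column and condition-column heads in `loss` above use a multi-label BCE in which the positive term is up-weighted by 3. The snippet below restates that formula in plain NumPy with invented scores, just to make the weighting explicit; it is not part of the SQLNet code.

import numpy as np

def weighted_bce(scores, truth, pos_weight=3.0, eps=1e-10):
    """-mean(w * y * log(p) + (1 - y) * log(1 - p)) with p = sigmoid(scores)."""
    probs = 1.0 / (1.0 + np.exp(-scores))
    return -np.mean(pos_weight * truth * np.log(probs + eps) +
                    (1 - truth) * np.log(1 - probs + eps))

scores = np.array([[2.0, -1.0, 0.5]])   # one example, three candidate columns
truth = np.array([[1.0, 0.0, 1.0]])     # columns 0 and 2 are the true targets
print(weighted_bce(scores, truth))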
Example #24
0
    eva_tfidf = Evaluation(tweets_tfidf)
    conf_matrix = eva_tfidf.build_confusion_matrix(tweets_tfidf)
    print "Confusion matrix:"
    print conf_matrix
    print "Accuracy using TF-IDF weighting algorithm: {}".format(
        eva_tfidf.accuracy())
    print "Average Precision using TF-IDF weighting algorithm: {}".format(
        eva_tfidf.average_precision())
    print "Average Recall using TF-IDF weighting algorithm: {}".format(
        eva_tfidf.average_recall())


# read the dataset
# preprocess the data: remove punctuation
data = Dataset()
data.load_dataset()
data.cleanse_dataset()
data.build_dictionaries()

tweets = data.get_dataset()
tweets_rake = tweets.copy()
tweets_tfidf = tweets.copy()

emb = WordEmbedding()
emb_vec = emb.load_embedding(emb_type='fasttext-id')

asp = Aspects()

run_experiment_with_rake()
run_experiment_with_tfidf(tweets_tfidf)
Example #25
0
class LDA2Vec:
    def __init__(self,
                 num_docs,
                 vocab_size,
                 num_topics,
                 embedding_size,
                 freqs,
                 batch_size,
                 save_graph,
                 num_sampled=40):
        self.num_docs = num_docs
        self.vocab_size = vocab_size
        self.num_topics = num_topics
        self.embedding_size = embedding_size
        self.freqs = freqs
        self.batch_size = batch_size
        self.save_graph = save_graph
        self.num_sampled = num_sampled
        self.lmbda = 200.0
        self.learning_rate = 0.001
        self.moving_avgs = tf.train.ExponentialMovingAverage(0.9)
        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.sesh = tf.Session(config=self.config)
        self.computed_norm = False

        self.logdir = "_".join(
            ("lda2vec", datetime.now().strftime('%y%m%d_%H%M')))

        self.w_embed = WordEmbedding(self.embedding_size,
                                     self.vocab_size,
                                     self.num_sampled,
                                     freqs=self.freqs)

        self.mixture = EmbeddingMixture(self.num_docs, self.num_topics,
                                        self.embedding_size)

        handles = self.retrieve_variables()

        (self.x, self.y, self.docs, self.step, self.switch_loss,
         self.word_context, self.doc_context, self.loss_word2vec,
         self.fraction, self.loss_lda, self.loss, self.loss_avgs_op,
         self.optimizer, self.merged) = handles

    def train(self,
              pivot_ids,
              target_ids,
              doc_ids,
              num_epochs,
              idx_to_word,
              switch_loss_epoch=5,
              save_every=1,
              report_every=1,
              print_topics_every=5):
        data_size = len(pivot_ids)

        temp_fraction = self.batch_size * 1.0 / data_size

        self.sesh.run(tf.assign(self.fraction, temp_fraction))

        iters_per_epoch = int(data_size / self.batch_size) + np.ceil(
            data_size % self.batch_size)

        switch_loss_step = iters_per_epoch * switch_loss_epoch

        self.sesh.run(tf.assign(self.switch_loss, switch_loss_step))

        if self.save_graph:

            saver = tf.train.Saver()

            writer = tf.summary.FileWriter(self.logdir + '/',
                                           graph=self.sesh.graph)

        for epoch in range(num_epochs):
            print('\nEPOCH:', epoch + 1)

            for pivot, target, doc in chunks(self.batch_size, pivot_ids,
                                             target_ids, doc_ids):

                feed_dict = {self.x: pivot, self.y: target, self.docs: doc}

                fetches = [
                    self.merged, self.optimizer, self.loss, self.loss_word2vec,
                    self.loss_lda, self.step
                ]

                summary, _, l, lw2v, llda, step = self.sesh.run(
                    fetches, feed_dict=feed_dict)

            if (epoch + 1) % report_every == 0:
                print('Loss: ', l, 'Word2Vec Loss: ', lw2v, 'LDA loss: ', llda)

            if (epoch + 1) % save_every == 0 and self.save_graph:
                writer.add_summary(summary, step)
                writer.flush()
                writer.close()
                save_path = saver.save(self.sesh, self.logdir + '/model.ckpt')
                writer = tf.summary.FileWriter(self.logdir + '/',
                                               graph=self.sesh.graph)

            if epoch > 0 and (epoch + 1) % print_topics_every == 0:
                idxs = np.arange(self.num_topics)
                words, sims = self.get_k_closest(idxs,
                                                 idx_to_word=idx_to_word,
                                                 k=10)

        if self.save_graph and (epoch + 1) % save_every != 0:
            writer.add_summary(summary, step)
            writer.flush()
            writer.close()
            save_path = saver.save(self.sesh, self.logdir + '/model.ckpt')

    def get_k_closest(self,
                      idxs,
                      in_type="topic",
                      vs_type="word",
                      k=10,
                      idx_to_word=None):
        if not self.computed_norm:
            self.normed_embed_dict = {}
            norm = tf.sqrt(
                tf.reduce_sum(self.mixture.topic_embedding**2,
                              1,
                              keep_dims=True))
            self.normed_embed_dict[
                'topic'] = self.mixture.topic_embedding / norm
            norm = tf.sqrt(
                tf.reduce_sum(self.w_embed.embedding**2, 1, keep_dims=True))
            self.normed_embed_dict['word'] = self.w_embed.embedding / norm
            norm = tf.sqrt(
                tf.reduce_sum(self.mixture.doc_embedding**2, 1,
                              keep_dims=True))
            self.normed_embed_dict['doc'] = self.mixture.doc_embedding / norm
            self.idxs_in = tf.placeholder(tf.int32, shape=[None], name='idxs')
            self.computed_norm = True

        self.batch_array = tf.nn.embedding_lookup(
            self.normed_embed_dict[in_type], self.idxs_in)
        self.cosine_similarity = tf.matmul(
            self.batch_array,
            tf.transpose(self.normed_embed_dict[vs_type], [1, 0]))
        feed_dict = {self.idxs_in: idxs}
        sim, sim_idxs = self.sesh.run(tf.nn.top_k(self.cosine_similarity, k=k),
                                      feed_dict=feed_dict)
        if idx_to_word:
            print(
                '---------Closest {} words to given indexes----------'.format(
                    k))
            for i, idx in enumerate(idxs):
                in_word = 'Topic ' + str(idx)
                vs_word_list = []
                for vs_i in range(sim_idxs[i].shape[0]):
                    vs_idx = sim_idxs[i][vs_i]
                    vs_word = idx_to_word[vs_idx]
                    vs_word_list.append(vs_word)
                    print(in_word, ':', (', ').join(vs_word_list))

        return (sim, sim_idxs)

    def retrieve_variables(self):
        x = tf.placeholder(tf.int32, shape=[None], name='x_pivot_idxs')
        y = tf.placeholder(tf.int64, shape=[None], name='y_target_idxs')
        docs = tf.placeholder(tf.int32, shape=[None], name='doc_ids')

        step = tf.Variable(0, trainable=False, name='global_step')

        switch_loss = tf.Variable(0, trainable=False)
        word_context = tf.nn.embedding_lookup(self.w_embed.embedding,
                                              x,
                                              name='word_embed_lookup')
        doc_context = self.mixture.get_context(doc_ids=docs)

        contexts_to_add = [word_context, doc_context]
        context = tf.add_n(contexts_to_add, name='context_vector')

        with tf.name_scope('nce_loss'):
            loss_word2vec = self.w_embed.compute_loss(context, y)
            tf.summary.scalar('nce_loss', loss_word2vec)

        with tf.name_scope('lda_loss'):
            fraction = tf.Variable(1,
                                   trainable=False,
                                   dtype=tf.float32,
                                   name='fraction')
            loss_lda = self.lmbda * fraction * self.prior()
            tf.summary.scalar('lda_loss', loss_lda)

        loss = tf.cond(step < switch_loss, lambda: loss_word2vec,
                       lambda: loss_word2vec + loss_lda)

        loss_avgs_op = self.moving_avgs.apply([loss_lda, loss_word2vec, loss])

        with tf.control_dependencies([loss_avgs_op]):
            optimizer = tf.contrib.layers.optimize_loss(
                loss,
                tf.train.get_global_step(),
                self.learning_rate,
                'Adam',
                name='Optimizer')

        self.sesh.run(
            tf.global_variables_initializer(),
            options=tf.RunOptions(report_tensor_allocations_upon_oom=True))

        merged = tf.summary.merge_all()

        return [
            x, y, docs, step, switch_loss, word_context, doc_context,
            loss_word2vec, fraction, loss_lda, loss, loss_avgs_op, optimizer,
            merged
        ]

    def prior(self):
        n_topics = self.mixture.doc_embedding.get_shape()[1].value
        alpha = 1.0 / n_topics
        log_proportions = tf.nn.log_softmax(self.mixture.doc_embedding)
        return tf.reduce_sum((alpha - 1.0) * log_proportions)
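`prior()` above adds a Dirichlet-style term over the document-topic proportions, with a symmetric concentration alpha = 1/n_topics applied to the log-softmax of the document embedding. The NumPy restatement below uses toy values, purely to make the computation explicit.

import numpy as np

def lda_prior(doc_weights):
    """sum((alpha - 1) * log_softmax(doc_weights)) with alpha = 1/n_topics."""
    n_topics = doc_weights.shape[1]
    alpha = 1.0 / n_topics
    shifted = doc_weights - doc_weights.max(axis=1, keepdims=True)
    log_p = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return np.sum((alpha - 1.0) * log_p)

# two documents, four topics (values invented for illustration)
print(lda_prior(np.array([[0.1, 2.0, -1.0, 0.3],
                          [1.5, 1.5, 1.5, 1.5]])))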
def train_feedback(nlq, db_name, correct_query, toy, word_emb):
    """
    Arguments:
        nlq: english question (tokenization is done here) - get from Flask (User)
        db_name: name of the database the query targets - get from Flask (User)
        correct_query: the ground truth query supplied by the user(s) - get from Flask
        toy: uses a small example of word embeddings to debug faster
    """

    ITER = 21

    SAVED_MODELS_FOLDER = "saved_models"
    OUTPUT_PATH = "output_inference.txt"
    HISTORY_TYPE = "full"
    GPU_ENABLE = False
    TRAIN_EMB = False
    TABLE_TYPE = "std"
    DATA_ROOT = "generated_data"

    use_hs = True
    if HISTORY_TYPE == "no":
        HISTORY_TYPE = "full"
        use_hs = False
    """
    Model Hyperparameters
    """
    N_word = 300  # word embedding dimension
    B_word = 42  # 42B tokens in the Glove pretrained embeddings
    N_h = 300  # hidden size dimension
    N_depth = 2

    if toy:
        USE_SMALL = True
        # GPU=True
        GPU = GPU_ENABLE
        BATCH_SIZE = 20
    else:
        USE_SMALL = False
        # GPU=True
        GPU = GPU_ENABLE
        BATCH_SIZE = 64
    # TRAIN_ENTRY=(False, True, False)  # (AGG, SEL, COND)
    # TRAIN_AGG, TRAIN_SEL, TRAIN_COND = TRAIN_ENTRY
    learning_rate = 1e-4

    # GENERATE CORRECT QUERY DATASET
    table_data_path = "./data/spider/tables.json"
    table_dict = get_table_dict(table_data_path)
    train_data_path = "./data/spider/train_spider.json"
    train_data = json.load(open(train_data_path))
    sql = correct_query  #"SELECT name ,  country ,  age FROM singer ORDER BY age DESC"
    db_id = db_name  #"concert_singer"
    table_file = table_data_path  # "tables.json"

    schemas, db_names, tables = get_schemas_from_json(table_file)
    schema = schemas[db_id]
    table = tables[db_id]
    schema = Schema(schema, table)
    sql_label = get_sql(schema, sql)
    correct_query_data = {
        "multi_sql_dataset": [],
        "keyword_dataset": [],
        "col_dataset": [],
        "op_dataset": [],
        "agg_dataset": [],
        "root_tem_dataset": [],
        "des_asc_dataset": [],
        "having_dataset": [],
        "andor_dataset": []
    }
    parser_item_with_long_history(
        tokenize(nlq),  #item["question_toks"], 
        sql_label,  #item["sql"], 
        table_dict[db_name],  #table_dict[item["db_id"]], 
        [],
        correct_query_data)
    # print("\nCorrect query dataset: {}".format(correct_query_data))

    for train_component in TRAIN_COMPONENTS:
        print("\nTRAIN COMPONENT: {}".format(train_component))
        # Check if the component to be trained is an actual component
        if train_component not in TRAIN_COMPONENTS:
            print("Invalid train component")
            exit(1)
        """
        Read in the data
        """
        train_data = load_train_dev_dataset(train_component, "train",
                                            HISTORY_TYPE, DATA_ROOT)
        # print("train_data type: {}".format(type(train_data)))
        dev_data = load_train_dev_dataset(train_component, "dev", HISTORY_TYPE,
                                          DATA_ROOT)
        # sql_data, table_data, val_sql_data, val_table_data, \
        #         test_sql_data, test_table_data, \
        #         TRAIN_DB, DEV_DB, TEST_DB = load_dataset(args.dataset, use_small=USE_SMALL)

        if GPU_ENABLE:
            map_to = "gpu"
        else:
            map_to = "cpu"

        # Selecting which Model to Train
        model = None
        if train_component == "multi_sql":
            model = MultiSqlPredictor(N_word=N_word,
                                      N_h=N_h,
                                      N_depth=N_depth,
                                      gpu=GPU,
                                      use_hs=use_hs)
            model.load_state_dict(
                torch.load(
                    "{}/multi_sql_models.dump".format(SAVED_MODELS_FOLDER),
                    map_location=map_to))

        elif train_component == "keyword":
            model = KeyWordPredictor(N_word=N_word,
                                     N_h=N_h,
                                     N_depth=N_depth,
                                     gpu=GPU,
                                     use_hs=use_hs)
            model.load_state_dict(
                torch.load(
                    "{}/keyword_models.dump".format(SAVED_MODELS_FOLDER),
                    map_location=map_to))

        elif train_component == "col":
            model = ColPredictor(N_word=N_word,
                                 N_h=N_h,
                                 N_depth=N_depth,
                                 gpu=GPU,
                                 use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/col_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))

        elif train_component == "op":
            model = OpPredictor(N_word=N_word,
                                N_h=N_h,
                                N_depth=N_depth,
                                gpu=GPU,
                                use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/op_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))

        elif train_component == "agg":
            model = AggPredictor(N_word=N_word,
                                 N_h=N_h,
                                 N_depth=N_depth,
                                 gpu=GPU,
                                 use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/agg_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))

        elif train_component == "root_tem":
            model = RootTeminalPredictor(N_word=N_word,
                                         N_h=N_h,
                                         N_depth=N_depth,
                                         gpu=GPU,
                                         use_hs=use_hs)
            model.load_state_dict(
                torch.load(
                    "{}/root_tem_models.dump".format(SAVED_MODELS_FOLDER),
                    map_location=map_to))

        elif train_component == "des_asc":
            model = DesAscLimitPredictor(N_word=N_word,
                                         N_h=N_h,
                                         N_depth=N_depth,
                                         gpu=GPU,
                                         use_hs=use_hs)
            model.load_state_dict(
                torch.load(
                    "{}/des_asc_models.dump".format(SAVED_MODELS_FOLDER),
                    map_location=map_to))

        elif train_component == "having":
            model = HavingPredictor(N_word=N_word,
                                    N_h=N_h,
                                    N_depth=N_depth,
                                    gpu=GPU,
                                    use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/having_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))

        elif train_component == "andor":
            model = AndOrPredictor(N_word=N_word,
                                   N_h=N_h,
                                   N_depth=N_depth,
                                   gpu=GPU,
                                   use_hs=use_hs)
            model.load_state_dict(
                torch.load("{}/andor_models.dump".format(SAVED_MODELS_FOLDER),
                           map_location=map_to))

        # model = SQLNet(word_emb, N_word=N_word, gpu=GPU, trainable_emb=args.train_emb)

        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate,
                                     weight_decay=0)
        print("finished build model")

        print_flag = False
        embed_layer = WordEmbedding(word_emb,
                                    N_word,
                                    gpu=GPU,
                                    SQL_TOK=SQL_TOK,
                                    trainable=TRAIN_EMB)

        print("start training")
        best_acc = 0.0
        for i in range(ITER):
            print('ITER %d @ %s' % (i + 1, datetime.datetime.now()))
            # arguments of epoch_train
            # model, optimizer, batch_size, component,embed_layer,data, table_type
            # print(' Loss = %s' % epoch_train(
            #                     model, optimizer, BATCH_SIZE,
            #                     args.train_component,
            #                     embed_layer,
            #                     train_data,
            #                     table_type=args.table_type))
            print('Total Loss = %s' %
                  epoch_feedback_train(model=model,
                                       optimizer=optimizer,
                                       batch_size=BATCH_SIZE,
                                       component=train_component,
                                       embed_layer=embed_layer,
                                       data=train_data,
                                       table_type=TABLE_TYPE,
                                       nlq=nlq,
                                       db_name=db_name,
                                       correct_query=correct_query,
                                       correct_query_data=correct_query_data))

            # Check improvement every 10 iterations
            if i % 10 == 0:
                acc = epoch_acc(model,
                                BATCH_SIZE,
                                train_component,
                                embed_layer,
                                dev_data,
                                table_type=TABLE_TYPE)
                if acc > best_acc:
                    best_acc = acc
                    print("Save model...")
                    torch.save(
                        model.state_dict(), SAVED_MODELS_FOLDER +
                        "/{}_models.dump".format(train_component))
logger = logging.getLogger(__name__)

# 1. Command line arguments
args = sys.argv
is_first_time = args[1]
parse_type = args[2]
embedding_type = args[3]

# 2. Loading the data
train_seqs, train_y, test_seqs, test_y = get_input(is_first_time=is_first_time,
                                                   parse_type=parse_type)

hparams = HyperParams().get_cnn_hyper_params()

# 3. Transform the data using embedding vectors.
embedding = WordEmbedding(train_seqs, hparams.embedding_dim)
if embedding_type == 'doc2vec':
    model = embedding.get_d2v_model()
else:
    model = embedding.get_w2v_model()
train_X = embedding.get_embedding_mtx(model, train_seqs)
test_X = embedding.get_embedding_mtx(model, test_seqs)

# 4. Reshape
train_X = train_X.reshape(
    [-1, consts.MAX_SEQUENCE_LENGTH, hparams.embedding_dim])
train_y = train_y.reshape([-1, consts.NUM_LABELS])
test_X = test_X.reshape(
    [-1, consts.MAX_SEQUENCE_LENGTH, hparams.embedding_dim])
test_y = test_y.reshape([-1, consts.NUM_LABELS])