Example #1
def preprocess_qa(source, nlp, w2vec_model, row_num=None, test=False):
    answer_n = len(source.answer_names)
    if row_num is None:
        quest_n_train = source.train_data.shape[0]
        quest_n_test = source.test_data.shape[0]
    else:
        quest_n_train, quest_n_test = row_num, row_num
    _helpScripts.print_msg('generate tree representations for qa')
    train_questions = source.train_data['question'].values[:row_num]
    train_answers = source.train_data[source.answer_names].values[:row_num]
    if test:
        test_questions = source.test_data['question'].values[:row_num]
        test_answers = source.test_data[source.answer_names].values[:row_num]

        sents_all = np.concatenate((train_questions, train_answers.flatten(),
                                    test_questions, test_answers.flatten()))
    else:
        sents_all = np.concatenate((train_questions, train_answers.flatten()))
    words, weights_trees, vocab, root_inds, weights_ = create_trees_all(sents_all, nlp)
    word_n = np.shape(weights_trees)[1]
    wc_num = len(weights_)

    mask_root_ind = get_root_ind_mask(root_inds, length=word_n)
    mask = np.zeros_like(mask_root_ind)
    for i, ri in enumerate(root_inds):
        mask[i, range(ri[-1] + 1)] = 1
    input_ = np.array([sents_to_w2v(sent, w2vec_model, word_n) for sent in words], dtype='float32')
    train_q, train_a = unflaffen_qa(input_[:quest_n_train*(1 + answer_n)],
                                           quest_n_train, answer_n)

    mask_train_ri_q, mask_train_ri_a = unflaffen_qa(mask_root_ind[:quest_n_train*(1 + answer_n)],
                                                                  quest_n_train, answer_n)
    mask_train_q, mask_train_a = unflaffen_qa(mask[:quest_n_train*(1 + answer_n)],
                                                   quest_n_train, answer_n)
    wt_train_q, wt_train_a = unflaffen_qa(weights_trees[:quest_n_train*(1 + answer_n)],
                                                        quest_n_train, answer_n)
    if not test:
        # the test split was not processed above, so return placeholders
        test_q, test_a = None, None
        mask_test_ri_q, mask_test_ri_a = None, None
        mask_test_q, mask_test_a = None, None
        wt_test_q, wt_test_a = None, None
    else:
        # the rows after the train portion belong to the test split
        test_q, test_a = unflaffen_qa(input_[quest_n_train*(1 + answer_n):],
                                      quest_n_test, answer_n)
        mask_test_ri_q, mask_test_ri_a = unflaffen_qa(mask_root_ind[quest_n_train*(1 + answer_n):],
                                                      quest_n_test, answer_n)
        mask_test_q, mask_test_a = unflaffen_qa(mask[quest_n_train*(1 + answer_n):],
                                                quest_n_test, answer_n)
        wt_test_q, wt_test_a = unflaffen_qa(weights_trees[quest_n_train*(1 + answer_n):],
                                            quest_n_test, answer_n)

    correct_answ_ = source.train_data['correctAnswer'].values[:row_num]
    targets = int32([np.where(answ == np.array(source.answer_sym))[0][0] for answ in correct_answ_])
    return (train_q, train_a, test_q, test_a,
            wt_train_q, wt_train_a, wt_test_q, wt_test_a,
            mask_train_q, mask_train_a, mask_test_q, mask_test_a,
            mask_train_ri_q, mask_train_ri_a, mask_test_ri_q, mask_test_ri_a,
            vocab, word_n, wc_num, targets)
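
A minimal usage sketch for preprocess_qa, assuming the _aai_data.Source, spaCy English (_spEnglish) and gensim Word2Vec objects that the __main__ blocks of the later examples construct; the argument values below are illustrative only:

    source = _aai_data.Source(create_sents=False)
    nlp = _spEnglish(data_dir=os.environ.get('SPACY_DATA', _spLOCAL_DATA_DIR))
    w2vec_model = Word2Vec.load(w2vec_model_name)

    (train_q, train_a, test_q, test_a,
     wt_train_q, wt_train_a, wt_test_q, wt_test_a,
     mask_train_q, mask_train_a, mask_test_q, mask_test_a,
     mask_train_ri_q, mask_train_ri_a, mask_test_ri_q, mask_test_ri_a,
     vocab, word_n, wc_num, targets) = preprocess_qa(source, nlp, w2vec_model,
                                                     row_num=100, test=True)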
Example #2
def pre_proc_quest(source, vocab, quest_data_file, row_num=None):
#-----------------------------------------------tokenize question / answer data
    _helpScripts.print_msg('generate question / answer data')

    quest_n_train = source.train_data.shape[0]
    quest_n_test = source.test_data.shape[0]

    answer_sym = ['A', 'B', 'C', 'D']
    answer_names = ['answer' + s for s in answer_sym]
    train_quest_tok = np.array([nltk.word_tokenize(sent) for sent in source.train_data['question'].values])
    test_quest_tok = np.array([nltk.word_tokenize(sent) for sent in source.test_data['question'].values])

    train_quest_sl = [len(t) for t in train_quest_tok]
    test_quest_sl = [len(t) for t in test_quest_tok]
    QUEST_SENT_L = np.max(train_quest_sl + test_quest_sl)

    train_answ_tok = np.transpose(np.array([[nltk.word_tokenize(sent) for sent in source.train_data[a].values] for a in answer_names]))
    test_answ_tok = np.transpose(np.array([[nltk.word_tokenize(sent) for sent in source.test_data[a].values] for a in answer_names]).reshape((-1, 4)))

    train_answ_sl = [len(t) for t in train_answ_tok.flatten()]
    test_answ_sl = [len(t) for t in test_answ_tok.flatten()]
    ANSW_SENT_L = np.max(train_answ_sl + test_answ_sl)
    print 'max question length is {}, answer length: {}'.format(QUEST_SENT_L, ANSW_SENT_L)
#-------------------------------------------------------generate the input data
    _helpScripts.print_msg('generate indexing for questions', allign='left')
    PAD_TOKEN_IND = len(vocab) - 1

    train_quest = np.expand_dims(int16(sents2indPar(train_quest_tok, vocab, proc=8, max_l=QUEST_SENT_L)), 1)
    test_quest = np.expand_dims(int16(sents2indPar(test_quest_tok, vocab, proc=8, max_l=QUEST_SENT_L)), 1)
    _helpScripts.print_msg('generate indexing for answers', allign='left')
    train_answ = int16(sents2indPar(train_answ_tok.reshape((-1)), vocab, proc=8, max_l=ANSW_SENT_L)).reshape((-1, 4, ANSW_SENT_L))
    test_answ = int16(sents2indPar(test_answ_tok.reshape((-1)), vocab, proc=8, max_l=ANSW_SENT_L)).reshape((-1, 4, ANSW_SENT_L))
    
    correct_answ_ = source.train_data['correctAnswer'].values
    train_correct_answ = float32([np.where(answ == np.array(answer_sym))[0][0] for answ in correct_answ_])
#-------------------------------------------------------------save data to file
    _helpScripts.print_msg('save data to file ' + quest_data_file, allign='left')
    with open(quest_data_file, 'wb') as f:
        cPickle.dump((train_quest, train_answ, train_correct_answ,
                      test_quest, test_answ), f, protocol=2)
    _helpScripts.print_msg('completed')

    return (train_quest, train_answ, train_correct_answ,
            test_quest, test_answ)
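
The pickled file written above can be reloaded later without re-tokenizing; a short sketch of reading it back, with the shapes implied by the reshaping above:

    with open(quest_data_file, 'rb') as f:
        (train_quest, train_answ, train_correct_answ,
         test_quest, test_answ) = cPickle.load(f)
    # train_quest: int16 word indices, shape (num_train, 1, QUEST_SENT_L)
    # train_answ:  int16 word indices, shape (num_train, 4, ANSW_SENT_L)
    # train_correct_answ: float32 index (0..3) of the correct answer per question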
Example #3
def pre_process_context(context_data_file, vocab_size=VOCAB_SIZE, row_num=None):
#-----------------------------------------------------------load whoosh results
    _helpScripts.print_msg('load whoosh results')
    with open(XAPIAN_DIR + 'train.pickle', 'rb') as f:
        wh_pred_answ_train, wh_scores_train, wh_themes_train = pickle.load(f)
    with open(XAPIAN_DIR + 'test.pickle', 'rb') as f:
        wh_pred_answ_test, wh_scores_test, wh_themes_test = pickle.load(f)

    if row_num is not None:
        wh_pred_answ_train, wh_scores_train, wh_themes_train = \
            wh_pred_answ_train[:row_num], wh_scores_train[:row_num], wh_themes_train[:row_num]
        wh_pred_answ_test, wh_scores_test, wh_themes_test = \
            wh_pred_answ_test[:row_num], wh_scores_test[:row_num], wh_themes_test[:row_num]

    themes_train_ = [wh_th.flatten() for wh_th in wh_themes_train]
    themes_train = set([item for sublist in themes_train_ for item in sublist])

    themes_test_ = [wh_th.flatten() for wh_th in wh_themes_test]
    themes_test = set([item for sublist in themes_test_ for item in sublist])
    themes_all = themes_train | themes_test
    themes_all -= set([0])
    themes_name_all = set([os.path.splitext(os.path.basename(th))[0] for th in list(themes_all)])
    print '{} articles are used from whoosh results'.format(len(themes_name_all))
#--------------------------------------------------------------tokenize results
    _helpScripts.print_msg('tokenize results')
    sentences_, art_names = tokenize_docs(list(themes_all), mode='append',
                                          add_sep=False,
                                          max_sent_length=MAX_SENT_LENGTH,
                                          min_sent_length=MIN_SENT_LENGTH)
    sentences = []
    sentences.extend(list(itertools.chain(*sentences_)))
    sent_l = [len(s) for s in sentences]
    print '{} sentences with sentence length: min = {}, max = {}, median = {}'.format(len(sent_l), np.min(sent_l), np.max(sent_l), np.median(sent_l))
#-------------------------------------------------------------------make source
    _helpScripts.print_msg('make source')
    source = _aai_data.Source(lower=True, create_sents=False)
    sentences.extend(source.sent_train_tokens)
    sentences.extend(source.sent_test_tokens)
    quest_n_train = source.train_data.shape[0]
    quest_n_test = source.test_data.shape[0]
    print 'there are {} train questions and {} test questions'.format(quest_n_train, quest_n_test)
#-----------------------------------------------------------generate vocabulary
    _helpScripts.print_msg('generate vocabulary')
    VOCAB = generate_vocab(sentences, vocab_size=vocab_size)
    vocab_size = len(VOCAB)
    PAD_TOKEN_IND = len(VOCAB) - 1
#----------------------------------------------------------------select themes
    _helpScripts.print_msg('select themes')
    _helpScripts.print_msg('for train', allign='left')
    themes_train = select_themes(wh_scores_train, wh_themes_train, themes_quest=ARTICLES_QUEST)
    themes_train_names = [[os.path.splitext(os.path.basename(th))[0] for th in th_q] for th_q in themes_train]
    _helpScripts.print_msg('for test', allign='left')
    themes_test = select_themes(wh_scores_test, wh_themes_test, themes_quest=ARTICLES_QUEST)
    themes_test_names = [[os.path.splitext(os.path.basename(th))[0] for th in th_q] for th_q in themes_test]
#--------------------------------------------------------------generate indices
    _helpScripts.print_msg('generate indices')
    _helpScripts.print_msg('for whole context', allign='left')

    sent_count = [[len(sents) for sents in sentences_[i]] for i in range(len(sentences_)) if art_names[i] in themes_all]
    sent_count = list(itertools.chain(*sent_count))
    
#    MAX_ART_THEME = np.median(sent_count)
    sentences_ = np.array(sentences_)
#    themes_all = list(themes_all)
    article_inds = get_art_word_ind(sentences_,
                                    VOCAB, MAX_SENT_ART,
                                    MAX_SENT_LENGTH, PAD_TOKEN_IND)
#-------------------------------------------------------------save data to file
    _helpScripts.print_msg('save data to file ' + context_data_file, allign='left')
    with open(context_data_file, 'wb') as f:
        cPickle.dump((VOCAB, sentences_, art_names, source,
                      article_inds,
                      themes_train, themes_test), f, protocol=2)
    _helpScripts.print_msg('completed')

    return (VOCAB, sentences_, art_names, source,
            article_inds, 
            themes_train, themes_test)
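
The __main__ blocks of the later examples wrap this function in a generate-or-load pattern; a condensed sketch, assuming the same saveDir they use:

    context_data_file = saveDir + 'context_data.cpickle'
    if os.path.isfile(context_data_file):
        with open(context_data_file, 'rb') as f:
            (VOCAB, sentences_, art_names, source,
             article_inds, themes_train, themes_test) = cPickle.load(f)
    else:
        (VOCAB, sentences_, art_names, source,
         article_inds, themes_train, themes_test) = pre_process_context(context_data_file)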
Example #4
    """
    return LL.get_output(encode_layer, inputs=X).eval()


def get_layer_by_name(net, name):
    """
    from https://github.com/mikesj-public/convolutional_autoencoder/blob/master/mnist_conv_autoencode.ipynb
    """
    for i, layer in enumerate(net.get_all_layers()):
        if layer.name == name:
            return layer, i
    return None, None
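

# A minimal, self-contained usage sketch for get_layer_by_name: any network
# object that exposes get_all_layers() and gives its layers a .name attribute
# (e.g. a nolearn/lasagne net) works; the tiny stand-in classes below are
# hypothetical and only exercise the lookup logic.
def _demo_get_layer_by_name():
    class _Layer(object):
        def __init__(self, name):
            self.name = name

    class _Net(object):
        def get_all_layers(self):
            return [_Layer('input'), _Layer('encode'), _Layer('output')]

    # returns the matching layer object and its index: here the 'encode' layer and 1
    return get_layer_by_name(_Net(), 'encode')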


if __name__ == "__main__":
    _helpScripts.print_msg('start at time ' + strftime("%Y-%m-%d %H:%M:%S"))
#    print strftime("%Y-%m-%d %H:%M:%S")
    startTime = datetime.datetime.now()
    MAX_SAMPLES = None
    os.chdir('/Users/lexx/Documents/Work/Kaggle/AllenAIScience/')
    if not os.path.isdir(saveDir):
        os.makedirs(saveDir)
#--------------------------------------------------generate / load context data
    context_data_file = saveDir + 'context_data.cpickle'
    if os.path.isfile(context_data_file):
        _helpScripts.print_msg('load context data from file: ' + context_data_file)
        with open(context_data_file, 'rb') as f:
            (VOCAB, sentences_, art_names, source,
             article_inds, themes_train, themes_test) = cPickle.load(f)
    else:
        (VOCAB, sentences_, art_names, source,
Example #5
    #    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger = logging.getLogger()
    fhandler = logging.FileHandler(filename='./data/w2v_models/mylog.log',
                                   mode='a')
    formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(message)s')
    fhandler.setFormatter(formatter)
    logger.addHandler(fhandler)
    logger.setLevel(logging.INFO)

    answer_sym = ['A', 'B', 'C', 'D']
    answer_names = ['answer' + s for s in answer_sym]

    #    quest_num_train = source.train_data.shape[0]

    if not os.path.isfile(w2vec_model_name):
        _helpScripts.print_msg('prepare the model')
        w2vec_model = Word2Vec(size=500,
                               alpha=0.1,
                               window=5,
                               min_count=5,
                               max_vocab_size=None,
                               sample=1e-5,
                               seed=1,
                               workers=16,
                               min_alpha=0.0001,
                               sg=1,
                               hs=1,
                               negative=5,
                               cbow_mean=0,
                               iter=2,
                               null_word=0,
                               trim_rule=None)
Example #6
    return scores__


def predict_answer(answer_sym, scores):
    pred_answ_ = np.argmax(scores, axis=1)
    pred_answ = [answer_sym[i] for i in pred_answ_]
    return pred_answ
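

# A minimal usage sketch for predict_answer with hypothetical scores: the input
# is a (num_questions, 4) score matrix, the output is one letter per question.
def _demo_predict_answer():
    demo_scores = np.array([[0.1, 0.7, 0.1, 0.1],
                            [0.4, 0.2, 0.3, 0.1]])
    # argmax over axis=1 picks columns 1 and 0, so this returns ['B', 'A']
    return predict_answer(['A', 'B', 'C', 'D'], demo_scores)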


if __name__ == "__main__":
    # os.chdir('/Users/lexx/Documents/Work/Kaggle/AllenAIScience/')
    #    corpus_tokens = get_all_corpus(search_dir)
    xapian_dir = './data/xapian/{0}/'.format(XAPIAN_VERSION)
    pmi_dir = './data/pmi/{0}/'.format(PMI_VERSION)
    # ------------------------------------------------------------------load results
    _helpScripts.print_msg('load results')
    _helpScripts.print_msg('pmi version {}'.format(PMI_VERSION), allign='left')
    with open(pmi_dir + 'train.pickle', 'rb') as f:
        (pmi_pred_answ_train, pmi_train, pmi_colls_train) = cPickle.load(f)
    with open(pmi_dir + 'test.pickle', 'rb') as f:
        (pmi_pred_answ_test, pmi_test, pmi_colls_test) = cPickle.load(f)

    _helpScripts.print_msg('xapian version {}'.format(XAPIAN_VERSION), allign='left')
    with open(xapian_dir + 'train.pickle', 'rb') as f:
        (x_pred_answ_train, x_train_scores, x_train_paths) = pickle.load(f)
    with open(xapian_dir + 'test.pickle', 'rb') as f:
        (x_pred_answ_test, x_test_scores, x_test_paths) = pickle.load(f)
    # ------------------------------------------------------------------------source
    source = _aai_data.Source(create_sents=False)
    # ---------------------------------------------------------------combine results
    x_train_scores_ = combine_scores_x(x_train_scores, mode='sum',
Example #7
    return scores__


def predict_answer(answer_sym, scores):
    pred_answ_ = np.argmax(scores, axis=1)
    pred_answ = [answer_sym[i] for i in pred_answ_]
    return pred_answ


if __name__ == "__main__":
    # os.chdir('/Users/lexx/Documents/Work/Kaggle/AllenAIScience/')
    #    corpus_tokens = get_all_corpus(search_dir)
    xapian_dir = './data/xapian/{0}/'.format(XAPIAN_VERSION)
    pmi_dir = './data/pmi/{0}/'.format(PMI_VERSION)
    # ------------------------------------------------------------------load results
    _helpScripts.print_msg('load results')
    _helpScripts.print_msg('pmi version {}'.format(PMI_VERSION), allign='left')
    with open(pmi_dir + 'train.pickle', 'rb') as f:
        (pmi_pred_answ_train, pmi_train, pmi_colls_train) = cPickle.load(f)
    with open(pmi_dir + 'test.pickle', 'rb') as f:
        (pmi_pred_answ_test, pmi_test, pmi_colls_test) = cPickle.load(f)

    _helpScripts.print_msg('xapian version {}'.format(XAPIAN_VERSION),
                           allign='left')
    with open(xapian_dir + 'train.pickle', 'rb') as f:
        (x_pred_answ_train, x_train_scores, x_train_paths) = pickle.load(f)
    with open(xapian_dir + 'test.pickle', 'rb') as f:
        (x_pred_answ_test, x_test_scores, x_test_paths) = pickle.load(f)
    # ------------------------------------------------------------------------source
    source = _aai_data.Source(create_sents=False)
    # ---------------------------------------------------------------combine results
Example #8
    return leave_idx


if __name__ == "__main__":
    os.chdir("/Users/lexx/Documents/Work/Kaggle/AllenAIScience/")

    save_dir = "./data/generated_sents/"
    source = _aai_data.Source()

    art_dirs_pre = u"./data/corpus/"
    wiki_dir = u"wiki_text_mod_spl_50/"
    ck12_full_spl_dir = u"ck12_full_themes_mod_spl_50/"
    ck12_wiki_spl_dir = u"wiki_text_ck12_themes_mod_spl_50/"
    art_dirs = [wiki_dir, ck12_full_spl_dir, ck12_wiki_spl_dir]

    _helpScripts.print_msg("Generate sentences")
    (sents_true, sents_false) = generate_sents_par(
        art_dirs_pre + art_dirs[2],
        pos_count=1,
        neg_count=3,
        rng=np.random.RandomState(156),
        save_dir=save_dir + art_dirs[2],
    )
    sents_true = np.array(sents_true)
    sents_false = np.array(sents_false)
    full_idx = get_full_sents(sents_true, sents_false)
    sents_true = sents_true[full_idx]
    sents_false = sents_false[full_idx]


def temp():
Example #9
#    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger = logging.getLogger()
    fhandler = logging.FileHandler(filename='./data/w2v_models/mylog.log', mode='a')
    formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(message)s')
    fhandler.setFormatter(formatter)
    logger.addHandler(fhandler)
    logger.setLevel(logging.INFO)

    answer_sym = ['A', 'B', 'C', 'D']
    answer_names = ['answer' + s for s in answer_sym]

#    quest_num_train = source.train_data.shape[0]


    if not os.path.isfile(w2vec_model_name):
        _helpScripts.print_msg('prepare the model')        
        w2vec_model = Word2Vec(size=500, alpha=0.1, window=5, min_count=5,
                               max_vocab_size=None, sample=1e-5, seed=1, workers=16,
                               min_alpha=0.0001, sg=1, hs=1, negative=5,
                               cbow_mean=0, iter=2, null_word=0, trim_rule=None)

#        if os.path.isfile('./data/w2v_models/sentences'):
#            with open(sent_file, 'rb') as f:
#                sentences = pickle.load(f)
#        else:

        wiki_dir = './data/corpus/wiki_text_mod/'
        ck12_full_spl_dir = u'./data/corpus/ck12_full_themes_mod/'
        ck12_wiki_spl_dir = u'./data/corpus/wiki_text_ck12_themes_mod/'

        _helpScripts.print_msg('tokenize wiki', allign='left')
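
A minimal sketch of how a Word2Vec model configured as above is typically trained and persisted with this generation of the gensim API; the sentences argument is assumed to be the list of token lists produced by the tokenization steps that follow in the original script:

    def train_and_save_w2v(model, sentences, path):
        # build the vocabulary, run skip-gram training (sg=1, hs=1, iter=2 as
        # configured above), then save so that Word2Vec.load(path) can reload it
        model.build_vocab(sentences)
        model.train(sentences)
        model.save(path)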
Example #10
def pre_proc_quest(source, vocab, quest_data_file, row_num=None):
    #-----------------------------------------------tokenize question / answer data
    _helpScripts.print_msg('generate question / answer data')

    quest_n_train = source.train_data.shape[0]
    quest_n_test = source.test_data.shape[0]

    answer_sym = ['A', 'B', 'C', 'D']
    answer_names = ['answer' + s for s in answer_sym]
    train_quest_tok = np.array([
        nltk.word_tokenize(sent)
        for sent in source.train_data['question'].values
    ])
    test_quest_tok = np.array([
        nltk.word_tokenize(sent)
        for sent in source.test_data['question'].values
    ])

    train_quest_sl = [len(t) for t in train_quest_tok]
    test_quest_sl = [len(t) for t in test_quest_tok]
    QUEST_SENT_L = np.max(train_quest_sl + test_quest_sl)

    train_answ_tok = np.transpose(
        np.array([[
            nltk.word_tokenize(sent) for sent in source.train_data[a].values
        ] for a in answer_names]))
    test_answ_tok = np.transpose(
        np.array(
            [[nltk.word_tokenize(sent) for sent in source.test_data[a].values]
             for a in answer_names]).reshape((-1, 4)))

    train_answ_sl = [len(t) for t in train_answ_tok.flatten()]
    test_answ_sl = [len(t) for t in test_answ_tok.flatten()]
    ANSW_SENT_L = np.max(train_answ_sl + test_answ_sl)
    print 'max question length is {}, answer length: {}'.format(
        QUEST_SENT_L, ANSW_SENT_L)
    #-------------------------------------------------------generate the input data
    _helpScripts.print_msg('generate indexing for questions', allign='left')
    PAD_TOKEN_IND = len(vocab) - 1

    train_quest = np.expand_dims(
        int16(sents2indPar(train_quest_tok, vocab, proc=8,
                           max_l=QUEST_SENT_L)), 1)
    test_quest = np.expand_dims(
        int16(sents2indPar(test_quest_tok, vocab, proc=8, max_l=QUEST_SENT_L)),
        1)
    _helpScripts.print_msg('generate indexing for answers', allign='left')
    train_answ = int16(
        sents2indPar(train_answ_tok.reshape((-1)),
                     vocab,
                     proc=8,
                     max_l=ANSW_SENT_L)).reshape((-1, 4, ANSW_SENT_L))
    test_answ = int16(
        sents2indPar(test_answ_tok.reshape((-1)),
                     vocab,
                     proc=8,
                     max_l=ANSW_SENT_L)).reshape((-1, 4, ANSW_SENT_L))

    correct_answ_ = source.train_data['correctAnswer'].values
    train_correct_answ = float32([
        np.where(answ == np.array(answer_sym))[0][0] for answ in correct_answ_
    ])
    #-------------------------------------------------------------save data to file
    _helpScripts.print_msg('save data to file ' + quest_data_file,
                           allign='left')
    with open(quest_data_file, 'wb') as f:
        cPickle.dump((train_quest, train_answ, train_correct_answ, test_quest,
                      test_answ),
                     f,
                     protocol=2)
    _helpScripts.print_msg('completed')

    return (train_quest, train_answ, train_correct_answ, test_quest, test_answ)
Example #11
def pre_process_context(context_data_file,
                        vocab_size=VOCAB_SIZE,
                        row_num=None):
    #-----------------------------------------------------------load whoosh results
    _helpScripts.print_msg('load whoosh results')
    with open(XAPIAN_DIR + 'train.pickle', 'rb') as f:
        wh_pred_answ_train, wh_scores_train, wh_themes_train = pickle.load(f)
    with open(XAPIAN_DIR + 'test.pickle', 'rb') as f:
        wh_pred_answ_test, wh_scores_test, wh_themes_test = pickle.load(f)

    if row_num is not None:
        wh_pred_answ_train, wh_scores_train, wh_themes_train = \
            wh_pred_answ_train[:row_num], wh_scores_train[:row_num], wh_themes_train[:row_num]
        wh_pred_answ_test, wh_scores_test, wh_themes_test = \
            wh_pred_answ_test[:row_num], wh_scores_test[:row_num], wh_themes_test[:row_num]

    themes_train_ = [wh_th.flatten() for wh_th in wh_themes_train]
    themes_train = set([item for sublist in themes_train_ for item in sublist])

    themes_test_ = [wh_th.flatten() for wh_th in wh_themes_test]
    themes_test = set([item for sublist in themes_test_ for item in sublist])
    themes_all = themes_train | themes_test
    themes_all -= set([0])
    themes_name_all = set(
        [os.path.splitext(os.path.basename(th))[0] for th in list(themes_all)])
    print '{} articles are used from whoosh results'.format(
        len(themes_name_all))
    #--------------------------------------------------------------tokenize results
    _helpScripts.print_msg('tokenize results')
    sentences_, art_names = tokenize_docs(list(themes_all),
                                          mode='append',
                                          add_sep=False,
                                          max_sent_length=MAX_SENT_LENGTH,
                                          min_sent_length=MIN_SENT_LENGTH)
    sentences = []
    sentences.extend(list(itertools.chain(*sentences_)))
    sent_l = [len(s) for s in sentences]
    print '{} sentences with sentence length: min = {}, max = {}, median = {}'.format(
        len(sent_l), np.min(sent_l), np.max(sent_l), np.median(sent_l))
    #-------------------------------------------------------------------make source
    _helpScripts.print_msg('make source')
    source = _aai_data.Source(lower=True, create_sents=False)
    sentences.extend(source.sent_train_tokens)
    sentences.extend(source.sent_test_tokens)
    quest_n_train = source.train_data.shape[0]
    quest_n_test = source.test_data.shape[0]
    print 'there are {} train questions and {} test questions'.format(
        quest_n_train, quest_n_test)
    #-----------------------------------------------------------generate vocabulary
    _helpScripts.print_msg('generate vocabulary')
    VOCAB = generate_vocab(sentences, vocab_size=vocab_size)
    vocab_size = len(VOCAB)
    PAD_TOKEN_IND = len(VOCAB) - 1
    #----------------------------------------------------------------select themes
    _helpScripts.print_msg('select themes')
    _helpScripts.print_msg('for train', allign='left')
    themes_train = select_themes(wh_scores_train,
                                 wh_themes_train,
                                 themes_quest=ARTICLES_QUEST)
    themes_train_names = [[
        os.path.splitext(os.path.basename(th))[0] for th in th_q
    ] for th_q in themes_train]
    _helpScripts.print_msg('for test', allign='left')
    themes_test = select_themes(wh_scores_test,
                                wh_themes_test,
                                themes_quest=ARTICLES_QUEST)
    themes_test_names = [[
        os.path.splitext(os.path.basename(th))[0] for th in th_q
    ] for th_q in themes_test]
    #--------------------------------------------------------------generate indices
    _helpScripts.print_msg('generate indices')
    _helpScripts.print_msg('for whole context', allign='left')

    sent_count = [[len(sents) for sents in sentences_[i]]
                  for i in range(len(sentences_))
                  if art_names[i] in themes_all]
    sent_count = list(itertools.chain(*sent_count))

    #    MAX_ART_THEME = np.median(sent_count)
    sentences_ = np.array(sentences_)
    #    themes_all = list(themes_all)
    article_inds = get_art_word_ind(sentences_, VOCAB, MAX_SENT_ART,
                                    MAX_SENT_LENGTH, PAD_TOKEN_IND)
    #-------------------------------------------------------------save data to file
    _helpScripts.print_msg('save data to file ' + context_data_file,
                           allign='left')
    with open(context_data_file, 'wb') as f:
        cPickle.dump((VOCAB, sentences_, art_names, source, article_inds,
                      themes_train, themes_test),
                     f,
                     protocol=2)
    _helpScripts.print_msg('completed')

    return (VOCAB, sentences_, art_names, source, article_inds, themes_train,
            themes_test)
Example #12
    """
    return LL.get_output(encode_layer, inputs=X).eval()


def get_layer_by_name(net, name):
    """
    from https://github.com/mikesj-public/convolutional_autoencoder/blob/master/mnist_conv_autoencode.ipynb
    """
    for i, layer in enumerate(net.get_all_layers()):
        if layer.name == name:
            return layer, i
    return None, None


if __name__ == "__main__":
    _helpScripts.print_msg('start at time ' + strftime("%Y-%m-%d %H:%M:%S"))
    #    print strftime("%Y-%m-%d %H:%M:%S")
    startTime = datetime.datetime.now()
    MAX_SAMPLES = None
    os.chdir('/Users/lexx/Documents/Work/Kaggle/AllenAIScience/')
    if not os.path.isdir(saveDir):
        os.makedirs(saveDir)
#--------------------------------------------------generate / load context data
    context_data_file = saveDir + 'context_data.cpickle'
    if os.path.isfile(context_data_file):
        _helpScripts.print_msg('load context data from file: ' +
                               context_data_file)
        with open(context_data_file, 'rb') as f:
            (VOCAB, sentences_, art_names, source, article_inds, themes_train,
             themes_test) = cPickle.load(f)
    else:
Example #13
    return leave_idx


if __name__ == '__main__':
    os.chdir('/Users/lexx/Documents/Work/Kaggle/AllenAIScience/')

    save_dir = './data/generated_sents/'
    source = _aai_data.Source()

    art_dirs_pre = u'./data/corpus/'
    wiki_dir = u'wiki_text_mod_spl_50/'
    ck12_full_spl_dir = u'ck12_full_themes_mod_spl_50/'
    ck12_wiki_spl_dir = u'wiki_text_ck12_themes_mod_spl_50/'
    art_dirs = [wiki_dir, ck12_full_spl_dir, ck12_wiki_spl_dir]

    _helpScripts.print_msg('Generate sentences')
    (sents_true, sents_false) = generate_sents_par(art_dirs_pre+art_dirs[2],
                                               pos_count=1, neg_count=3,
                                               rng=np.random.RandomState(156),
                                               save_dir=save_dir+art_dirs[2])
    sents_true = np.array(sents_true)
    sents_false = np.array(sents_false)
    full_idx = get_full_sents(sents_true, sents_false)
    sents_true = sents_true[full_idx]
    sents_false = sents_false[full_idx]
    


def temp():
    art_files = [glob.glob(art_dirs_pre + art_dir + '*.txt') for art_dir in art_dirs]
    
Example #14
    """
    return LL.get_output(encode_layer, inputs=X).eval()


def get_layer_by_name(net, name):
    """
    from https://github.com/mikesj-public/convolutional_autoencoder/blob/master/mnist_conv_autoencode.ipynb
    """
    for i, layer in enumerate(net.get_all_layers()):
        if layer.name == name:
            return layer, i
    return None, None


if __name__ == '__main__':
    _helpScripts.print_msg('start at time ' + strftime("%Y-%m-%d %H:%M:%S"))
#    print strftime("%Y-%m-%d %H:%M:%S")
    startTime = datetime.datetime.now()

#    os.chdir('/Users/lexx/Documents/Work/Kaggle/AllenAIScience/')
    if not os.path.isdir(saveDir):
        os.makedirs(saveDir)
    nlp = _spEnglish(data_dir=os.environ.get('SPACY_DATA', _spLOCAL_DATA_DIR))
#--------------------------------------------------------------load w2vec model
    w2vec_model_name = W2VEC_DIR + 'm{}'.format(W2VEC_VERSION)
    _helpScripts.print_msg('load the word2vec model from ' + w2vec_model_name)
    w2vec_model = Word2Vec.load(w2vec_model_name)
    w2vec_model.init_sims(replace=True)
    repr_length = w2vec_model.syn0.shape[1]
#--------------------------------------------------generate / load context data
#-------------------------------------------------------------------make source
Example #15
def preprocess_qa(source, nlp, w2vec_model, row_num=None, test=False):
    answer_n = len(source.answer_names)
    if row_num is None:
        quest_n_train = source.train_data.shape[0]
        quest_n_test = source.test_data.shape[0]
    else:
        quest_n_train, quest_n_test = row_num, row_num
    _helpScripts.print_msg('generate tree representations for qa')
    train_questions = source.train_data['question'].values[:row_num]
    train_answers = source.train_data[source.answer_names].values[:row_num]
    if test:
        test_questions = source.test_data['question'].values[:row_num]
        test_answers = source.test_data[source.answer_names].values[:row_num]

        sents_all = np.concatenate(
            (train_questions, train_answers.flatten(), test_questions,
             test_answers.flatten()))
    else:
        sents_all = np.concatenate((train_questions, train_answers.flatten()))
    words, weights_trees, vocab, root_inds, weights_ = create_trees_all(
        sents_all, nlp)
    word_n = np.shape(weights_trees)[1]
    wc_num = len(weights_)

    mask_root_ind = get_root_ind_mask(root_inds, length=word_n)
    mask = np.zeros_like(mask_root_ind)
    for i, ri in enumerate(root_inds):
        mask[i, range(ri[-1] + 1)] = 1
    input_ = np.array(
        [sents_to_w2v(sent, w2vec_model, word_n) for sent in words],
        dtype='float32')
    train_q, train_a = unflaffen_qa(input_[:quest_n_train * (1 + answer_n)],
                                    quest_n_train, answer_n)

    mask_train_ri_q, mask_train_ri_a = unflaffen_qa(
        mask_root_ind[:quest_n_train * (1 + answer_n)], quest_n_train,
        answer_n)
    mask_train_q, mask_train_a = unflaffen_qa(
        mask[:quest_n_train * (1 + answer_n)], quest_n_train, answer_n)
    wt_train_q, wt_train_a = unflaffen_qa(
        weights_trees[:quest_n_train * (1 + answer_n)], quest_n_train,
        answer_n)
    if not test:
        # the test split was not processed above, so return placeholders
        test_q, test_a = None, None
        mask_test_ri_q, mask_test_ri_a = None, None
        mask_test_q, mask_test_a = None, None
        wt_test_q, wt_test_a = None, None
    else:
        # the rows after the train portion belong to the test split
        test_q, test_a = unflaffen_qa(input_[quest_n_train * (1 + answer_n):],
                                      quest_n_test, answer_n)
        mask_test_ri_q, mask_test_ri_a = unflaffen_qa(
            mask_root_ind[quest_n_train * (1 + answer_n):], quest_n_test,
            answer_n)
        mask_test_q, mask_test_a = unflaffen_qa(
            mask[quest_n_train * (1 + answer_n):], quest_n_test, answer_n)
        wt_test_q, wt_test_a = unflaffen_qa(
            weights_trees[quest_n_train * (1 + answer_n):], quest_n_test,
            answer_n)

    correct_answ_ = source.train_data['correctAnswer'].values[:row_num]
    targets = int32([
        np.where(answ == np.array(source.answer_sym))[0][0]
        for answ in correct_answ_
    ])
    return (train_q, train_a, test_q, test_a, wt_train_q, wt_train_a,
            wt_test_q, wt_test_a, mask_train_q, mask_train_a, mask_test_q,
            mask_test_a, mask_train_ri_q, mask_train_ri_a, mask_test_ri_q,
            mask_test_ri_a, vocab, word_n, wc_num, targets)
Example #16
    """
    return LL.get_output(encode_layer, inputs=X).eval()


def get_layer_by_name(net, name):
    """
    from https://github.com/mikesj-public/convolutional_autoencoder/blob/master/mnist_conv_autoencode.ipynb
    """
    for i, layer in enumerate(net.get_all_layers()):
        if layer.name == name:
            return layer, i
    return None, None


if __name__ == '__main__':
    _helpScripts.print_msg('start at time ' + strftime("%Y-%m-%d %H:%M:%S"))
    #    print strftime("%Y-%m-%d %H:%M:%S")
    startTime = datetime.datetime.now()

    #    os.chdir('/Users/lexx/Documents/Work/Kaggle/AllenAIScience/')
    if not os.path.isdir(saveDir):
        os.makedirs(saveDir)
    nlp = _spEnglish(data_dir=os.environ.get('SPACY_DATA', _spLOCAL_DATA_DIR))
    #--------------------------------------------------------------load w2vec model
    w2vec_model_name = W2VEC_DIR + 'm{}'.format(W2VEC_VERSION)
    _helpScripts.print_msg('load the word2vec model from ' + w2vec_model_name)
    w2vec_model = Word2Vec.load(w2vec_model_name)
    w2vec_model.init_sims(replace=True)
    repr_length = w2vec_model.syn0.shape[1]
    #--------------------------------------------------generate / load context data
    #-------------------------------------------------------------------make source