Example 1
def prep_data():
    input_texts, mesh_outputs = load_data.assemble_pairs()
    abstract_p = preprocess.Preprocessor()
    
    # preprocess and encode texts (inputs)
    abstract_p.preprocess(input_texts)
    X = abstract_p.encode_texts(input_texts)

    labels_p = preprocess.Preprocessor(vocab_size=None, split_char=".", normalize=False)
    labels_p.preprocess(mesh_outputs)
    Y = labels_p.encode_texts(mesh_outputs)

    return (input_texts, abstract_p, mesh_outputs, labels_p, list(zip(X,Y)))
Example 2
def test():
    '''
    Trains the model, makes predictions, and prints its cross-validation score
    '''
    matplotlib.rcParams['backend'] = 'Qt5Agg'
    matplotlib.get_backend()
    D = DataManager(data_name, data_dir)
    # Load the model
    mdl = model()

    Prepro = prepro.Preprocessor()
    #D.data['X_train'] = Prepro.removeOutliers(D.data['X_train'])
    #D.data['Y_train'] = Prepro.removeOutliers(D.data['Y_train'])
    X_train = D.data['X_train']
    Y_train = D.data['Y_train'].ravel()

    # test training
    mdl.fit(X_train, Y_train)

    # test prediction
    Y_hat_train = mdl.predict(D.data['X_train'])
    Y_hat_valid = mdl.predict(D.data['X_valid'])
    Y_hat_test = mdl.predict(D.data['X_test'])

    metric_name, scoring_function = get_metric()
    scores = cross_val_score(mdl,
                             X_train,
                             Y_train,
                             cv=5,
                             scoring=make_scorer(scoring_function))
    print('\nCV score (95 perc. CI): %0.2f (+/- %0.2f)' %
          (scores.mean(), scores.std() * 2))
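
For reference, a minimal self-contained version of the same cross-validation scoring pattern, with a toy dataset and plain accuracy standing in for the project's model() and get_metric() helpers (all names below are illustrative only):

# Sketch: 5-fold CV scoring with a custom scorer, analogous to the snippet above.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
clf = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf, X_demo, y_demo, cv=5,
                         scoring=make_scorer(accuracy_score))
print('CV score (95 perc. CI): %0.2f (+/- %0.2f)' %
      (scores.mean(), scores.std() * 2))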
Example 3
def jsd(fileName, query):
    pp = preprocess.Preprocessor()
    fileText = pp.prepDoc(fileName, combine = True)
    queryText = pp.preprocess(query)
    texts = [queryText, fileText]
    probDists = getProbDists(texts)
    return jensenshannon.jensen_shannon_divergence(numpy.array(probDists))
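
The getProbDists and jensenshannon helpers above are project code not shown here; a minimal sketch of the underlying computation using SciPy, which returns the Jensen-Shannon distance (the square root of the divergence):

# Sketch: Jensen-Shannon divergence between two toy probability distributions.
import numpy as np
from scipy.spatial.distance import jensenshannon

p = np.array([0.5, 0.3, 0.2])   # e.g. query term distribution
q = np.array([0.4, 0.4, 0.2])   # e.g. document term distribution
js_distance = jensenshannon(p, q)
js_divergence = js_distance ** 2
print(js_divergence)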
Example 4
def testFile(fileName, query):
    pp = preprocess.Preprocessor()
    fileText = pp.prepDoc(fileName, combine=True)
    #print fileText
    queryText = pp.preprocess(query)
    texts = [queryText, fileText]
    #print texts
    probDists = getProbDists(texts)
    #print probDists
    print(jensenshannon.jensen_shannon_divergence(numpy.array(probDists)))
Example 5
def data_prep(seed):
    # Renamed locals so they don't shadow the imported modules; re-assigning the
    # module name inside the function would raise UnboundLocalError.
    profile_obj = profile.Profile()
    interest_obj = interest.Interest()
    preprocessor = preprocess.Preprocessor()
    profile_raw = profile_obj.get_profile()
    interest_raw, ids = interest_obj.data_merge()
    data = preprocessor.finalize_data(profile_raw, interest_raw)
    X, y, X_train, y_train, X_test, y_test = preprocessor.split_data(
        data, seed=seed, re=False)
    return X, y, X_train, y_train, X_test, y_test, ids
Example 6
def preprocess_data(url, seed):
    preprocessor = preprocess.Preprocessor()
    raw_data = preprocessor.get_data(url)
    contain_null = preprocessor.get_null(raw_data)
    for f in contain_null:
        raw_data.loc[(raw_data[f].isnull()),
                     f] = preprocessor.ImputeVoteClassifier(raw_data, f)
    X_train, y_train, X_test, y_test = preprocessor.split_data(raw_data,
                                                               seed,
                                                               re=False)
    return X_train, y_train, X_test, y_test
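
ImputeVoteClassifier and split_data are project-specific; a generic sketch of the same loc-based imputation loop using pandas, with the column mode as a stand-in for the classifier-based imputer:

# Sketch: fill each null-containing column in place, mirroring the loop above.
import pandas as pd

df = pd.DataFrame({"a": [1.0, None, 3.0], "b": ["x", "y", None]})
contain_null = [c for c in df.columns if df[c].isnull().any()]
for f in contain_null:
    # stand-in imputer: use the column mode
    df.loc[df[f].isnull(), f] = df[f].mode().iloc[0]
print(df)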
Example 7
def main():
    preprocessor = preprocess.Preprocessor(
        '{}/words.txt'.format(cwd))  # preprocess is called
    ''' poor data sets: '''
    # preprocessor2 = preprocess.Preprocessor('{}/2-letter-words.json'.format(cwd))
    # preprocessor3 = preprocess.Preprocessor('{}/3-letter-words.json'.format(cwd))
    # preprocessor4 = preprocess.Preprocessor('{}/4-letter-words.json'.format(cwd))
    # preprocessor5 = preprocess.Preprocessor('{}/5-letter-words.json'.format(cwd))
    # preprocessor6 = preprocess.Preprocessor('{}/6-letter-words.json'.format(cwd))
    # preprocessor7 = preprocess.Preprocessor('{}/7-letter-words.json'.format(cwd))
    # preprocessor8 = preprocess.Preprocessor('{}/8-letter-words.json'.format(cwd))
    # preprocessor9 = preprocess.Preprocessor('{}/9-letter-words.json'.format(cwd))
    # preprocessor10 = preprocess.Preprocessor('{}/10-letter-words.json'.format(cwd))
    # preprocessor11 = preprocess.Preprocessor('{}/11-letter-words.json'.format(cwd))
    # preprocessor12 = preprocess.Preprocessor('{}/12-letter-words.json'.format(cwd))
    vocabpreprocessor = preprocess.Preprocessor('{}/vocab.txt'.format(cwd))
    moreWords = preprocess.Preprocessor(
        '{}/entriesWithCollocates.txt'.format(cwd))
    global wordDict
    '''preprocessor.processedWords +'''
    # wordDict = preprocessor.processedWords + preprocessor5.processedWords + preprocessor6.processedWords + preprocessor7.processedWords + preprocessor8.processedWords + preprocessor9.processedWords + preprocessor10.processedWords + preprocessor11.processedWords + preprocessor12.processedWords + vocabpreprocessor.processedWords + moreWords.processedWords
    wordDict = preprocessor.processedWords + vocabpreprocessor.processedWords + moreWords.processedWords
    # set the word dict so the game can find the best guesses
    # print 'PROCESSED WORDS={}'.format(preprocessor.processedWords)
    game = Game()  # starts the game

    while 1:
        GUESS = game.getNextBestGuess()
        game.guess(GUESS)
    # try:
    # 	while 1:
    # 		GUESS = game.getNextBestGuess()
    # 		game.guess(GUESS)
    # except:
    # 	print '\nGAME ENDED'
    return 0
Example 8
 def get_data(
     self,
     dsn_database,
     dsn_hostname,
     dsn_port,
     dsn_protocol,
     dsn_uid,
     dsn_pwd,
     level,
 ):
     # Use a distinct local name so the imported "preprocess" module isn't
     # shadowed (re-assigning the module name here raises UnboundLocalError).
     preprocessor = preprocess.Preprocessor()
     raw_data = preprocessor.db2_connect(
         dsn_database, dsn_hostname, dsn_port, dsn_protocol, dsn_uid, dsn_pwd
     )
     data = preprocessor.data_preprocess(raw_data, level)
     return data
Example 9
    def compress(self, samFilename, compressedFilename, gtf, min_filename, frag_len_z_cutoff, split_diff_strands, split_discordant):
        ''' Compresses the alignments to 2 files, one for unspliced and one for spliced

            file_prefix: Prefix for all output file names
        '''

        self.p = preprocess.Preprocessor(samFilename, frag_len_z_cutoff, split_diff_strands)

        if not self.frag_len_cutoff:
            self.frag_len_cutoff = self.p.frag_len_cutoff
        print('Using fragment length cutoff of ' + str(self.frag_len_cutoff))

        if split_diff_strands:
            print('Splitting mates on different strands')
        else:
            print('Not splitting mates on different strands')

        if split_discordant:
            print('Splitting discordant')
        else:
            print('Not splitting discordant')

        # Reads on different strands that should be unpaired
        self.diff_strand_unpaired = self.p.unpaired
        del self.p

        # Read header
        header = ''
        with open(samFilename, 'r') as f:
            for line in f:
                if line[0] == '@':
                    header += line
                else:
                    break
        self.chromosomes = self.parseSAMHeader(header)
        self.aligned = alignments.Alignments(self.chromosomes, self.frag_len_cutoff, split_discordant)

        if gtf:
            self.aligned.gtf_exons = self.parseGTF(gtf, self.aligned.chromOffsets)

        self.compressByBundle(samFilename, compressedFilename, min_filename)

        #print('%d unmatched' % self.aligned.numUnmatched)
        print('Approximately %d / %d = %f%% of compressed file is coverage' % (self.covSize, self.totalSize, 100.0*float(self.covSize)/float(self.totalSize)))
        print('Finished compressing')
Example 10
def gate_value_report_write(fname,evids_ids,fact_ids,gate_v):
    '''
    Record the correspondence between gate values and generated facts; each
    generated fact is mapped to the id of the best evidence used to produce it.
    :param fname: output file name
    :param evids_ids: sequence of evidence ids
    :param fact_ids: sequence of fact ids
    :param gate_v: gate values
    :return:
    '''
    p = preprocess.Preprocessor(False)
    fact = p.get_char_list(fact_ids)


    evids = []
    e_w = []
    for e in evids_ids:
        if e[0] == 2:
            e_w.append(0)
            for i in range(len(e)):
                if e[i] == 1:
                    e = e[:i]
                    break
            evids.append(p.get_sentence(e))
        else:
            break
    f = open(fname,'a',encoding='utf-8')
    fact_len = 0
    for g_i in range(len(gate_v)):
        if int(fact_ids[g_i])==1:
            break
        fact_len+=1
        e_w[gate_v[g_i]]+=1
    for i in range(len(evids)):

        f.write('%d\t%s'%(e_w[i],evids[i]))
        f.write('\n')
    for g in range(fact_len):
        f.write('%d\t'%gate_v[g])
    f.write('\n')
    for f_c in fact:
        f.write(f_c+'\t')
    f.write('\n')

    f.close()
Example 11
    def __init__(self):
        """Main class for antiderivative detection."""
        app_id = 'LHLP7U-HHLKWGU3AT'.lower()

        self._wolfram_client = wolframalpha.Client(app_id)
        self.img_input = None  # type: t.Optional[np.ndarray]
        self.img_solved = None  # type: t.Optional[np.ndarray]
        self.img_segments = None  # type: t.Optional[t.Sequence[np.ndarray]]

        self.models = self._load_models(path=os.path.join(
            os.path.realpath(__file__)[:-len(os.path.basename(__file__))],
            "models"))

        self._preprocessor = preprocess.Preprocessor()
        self._postprocessor = postprocess.Postprocessor()

        # Must have correspondence with the class codification
        # used to train the CNN model loaded just above. Don't
        # change the symbol order.
        self._CLASS_SYMBOL = (
            "0",
            "1",
            "x",
            "+",
            "-",
            "/",
            "(",
            ")",
            "e",
            "integrate",
            "d",
            "2",
            "3",
            "4",
            "5",
            "6",
            "7",
            "8",
            "9",
        )

        self._RE_FIX_DNOTATION = re.compile(r"(?<=d)\s+(?=.)")
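
A hedged sketch of how the wolframalpha client created above is typically queried; the query string and result handling below are illustrative and not taken from this project:

# Sketch: query Wolfram|Alpha and read the first textual result pod.
import wolframalpha

client = wolframalpha.Client("YOUR-APP-ID")
res = client.query("integrate x^2 dx")
print(next(res.results).text)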
Example 12
 def __init__(self):
     '''
     fancy_classifier = Pipeline([
                 ('preprocessing', Preprocessor()),
                 ('classification', RandomForestClassifier(n_estimators=136, max_depth=None, min_samples_split=2, random_state=0))
                 ])
     self.clf = VotingClassifier(estimators=[
                 ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
                 ('Gaussian Classifier', GaussianNB()),
                 ('Support Vector Machine', SVC(probability=True)),
                 ('Fancy Classifier', fancy_classifier)],
                 voting='soft')   
     '''
     self.mdl = RandomForestClassifier(n_estimators=136,
                                       max_depth=None,
                                       min_samples_split=2,
                                       random_state=0)
     self.num_train_samples = 0
     self.num_feat = 1
     self.num_labels = 1
     self.prep = prepro.Preprocessor()
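
The commented-out ensemble above can be reproduced as a runnable sketch on a toy dataset; the project's Preprocessor pipeline step is omitted here:

# Sketch: soft-voting ensemble corresponding to the commented-out classifier.
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
clf = VotingClassifier(estimators=[
    ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
    ('Gaussian Classifier', GaussianNB()),
    ('Support Vector Machine', SVC(probability=True)),
    ('Random Forest', RandomForestClassifier(n_estimators=136, random_state=0))],
    voting='soft')
clf.fit(X_demo, y_demo)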
Example 13
            opt_lines = 1

    if not opt_system:
        if opt_testing:
            # If we are testing just use the lingogi file
            opt_system = os.path.join('platforms', 'lingogi', 'system.h')
        else:
            err("You need to specify -s, see -h")
            sys.exit(2)

    if not os.path.exists(opt_system):
        err("'%s' does not exist" % opt_system)
        sys.exit(2)

    # First do all preprocessing from pch.h
    processor = pp.Preprocessor()
    processor["PRODUCT_SYSTEM_FILE"] = '"' + opt_system + '"'
    processor.addUserIncludePath(".")
    processor.ignoreErrors()

    # If defines are specified we parse them here
    # currently treated as one space separated string
    # and we replace \" with " in string defines.
    #
    # FIXME: The python macros store the name
    # and value, the current code is pike inherited
    # and thus redundantly stores the name twice
    #
    if opt_defines:
        for define in opt_defines:
            if len(define):
Example 14
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]

    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)

    return tf.math.reduce_mean(acc, axis=0)
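
# Aside (not part of the original script): a minimal, self-contained sketch of the
# tf.map_fn pattern used above -- mapping a per-example function over the batch
# dimension of a tuple of tensors. Assumes tensorflow is imported as tf, as in
# the surrounding script; all names below are illustrative only.
def _map_fn_demo():
    xs = tf.random.uniform((4, 10))
    ys = tf.random.uniform((4, 10))
    per_example = lambda args: (tf.reduce_max(args[0]) + tf.reduce_max(args[1])) / 2.0
    return tf.map_fn(per_example, (xs, ys), dtype=tf.float32)  # shape: (4,)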

if __name__ == '__main__':
    tmp_path="D:/dhm/programer-lx/BiDAF_tf2"
    ds = preprocess.Preprocessor([
        tmp_path+'/data/squad/train-v1.1.json',
        tmp_path+'/data/squad/dev-v1.1.json',
        tmp_path+'/data/squad/dev-v1.1.json'
    ])

##    train_c, train_q, train_y = ds.get_dataset(tmp_path+'/data/squad/train-v1.1.json')
##    test_c, test_q, test_y = ds.get_dataset(tmp_path+'/data/squad/dev-v1.1.json')
    train_cc, train_cq, train_wc, train_wq, train_y = ds.get_dataset(tmp_path+'/data/squad/test.json')
    test_cc, test_cq, test_wc, test_wq, test_y = ds.get_dataset(tmp_path+'/data/squad/test.json')

    bidaf = BiDAF(
        clen=ds.max_clen,
        qlen=ds.max_qlen,
        emb_size=50,
        max_features=len(ds.charset),  # ds.charset
        vocab_size=len(ds.word_list),
        conv_layers=[[10, 1], [10, 2], [30, 3]],  # sizes and counts of the convolution kernels
Example 15
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]

    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)

    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor(
        ['./data/squad/train-v1.1.json', './data/squad/dev-v1.1.json'],
        ['./data/glove.6B/glove.6B.50d.txt'])
    '''
    train_c, train_q, train_y = ds.get_dataset('./data/squad/train-v1.1.json')
    test_c, test_q, test_y = ds.get_dataset('./data/squad/dev-v1.1.json')
    

    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    '''

    bidaf = BiDAF(
        clen=ds.max_clen,
        qlen=ds.max_qlen,
        emb_size=50,
Example 16
def main():
    preprocessor = preprocess.Preprocessor(
        '{}/2-letter-words.json'.format(cwd))  # preprocess is called
    print(preprocessor.processedWords)
Example 17
test_path = "../input/test_data.csv"
""" load raw data"""
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
""" Preprocessing"""
import preprocess as pr
import impute as im

import copy

df = train["y"]

predata = pd.concat([train.drop("y", axis=1), test], ignore_index=True)
predata_copy = copy.deepcopy(predata)
"""predata_onehot = pr.Preprocessor(predata).all("onehot")"""
predata_label = pr.Preprocessor(predata_copy).all("label", "date")
"""prep_train_onehot = pd.concat([df, predata_onehot.iloc[:len(train), :]], axis=1)
prep_test_onehot = predata_onehot.iloc[len(train):, :]"""

prep_train_label = pd.concat([df, predata_label.iloc[:len(train), :]], axis=1)
prep_test_label = predata_label.iloc[len(train):, :]
"""prep_train_onehot.to_csv("../prep_train_onehot.csv", index=False)
prep_test_onehot.to_csv("../prep_test_onehot.csv", index=False)
prep_train_label.to_csv("../prep_train_label.csv", index=False)
prep_test_label.to_csv("../prep_test_label.csv", index=False)"""
""" define data"""
train_X = prep_train_label.drop([
    "y", "video_id", "thumbnail_link", "publishedAt", "collection_date", "id",
    "tags", "description", "title"
],
                                axis=1)
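
Generic sketch of the concat, preprocess jointly, then split-back pattern used above; pr.Preprocessor is project code, so pandas one-hot encoding stands in here:

# Sketch: preprocess train and test together, then split the rows back apart.
import pandas as pd

train_demo = pd.DataFrame({"y": [1, 2], "cat": ["a", "b"]})
test_demo = pd.DataFrame({"cat": ["b", "c"]})
both = pd.concat([train_demo.drop("y", axis=1), test_demo], ignore_index=True)
both = pd.get_dummies(both)                     # joint preprocessing
train_feats = both.iloc[:len(train_demo), :]    # split back into train...
test_feats = both.iloc[len(train_demo):, :]     # ...and test rows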
Example 18
def test(string1, string2):
    pp = preprocess.Preprocessor()
    texts = [pp.preprocess(string1), pp.preprocess(string2)]

    #print dictionary.token2id
    getProbDists(texts)
Example 19
import os
import re
import sys
import typing as t

import numpy as np
import skimage
import skimage.transform
import imageio

sys.path.insert(0, "../antideriv")
import preprocess as antideriv_preproc  # noqa: ignore


OUTPUT_PATH = "./data-augmented-preprocessed"
RE_CLASS_NAME = re.compile(r"(?<=class_)[^_]+")
OUTPUT_FILE_TYPE = "png"

PREPROCESSOR_MODEL = antideriv_preproc.Preprocessor()
"""Preprocess the training data the same way as a regular input."""


def resize(img: np.ndarray,
           output_shape: t.Tuple[int, int] = (45, 45)) -> np.ndarray:
    """Resize image to ``output_shape`` with interpolation of order 3."""
    img = skimage.transform.resize(
        image=img,
        output_shape=output_shape,
        anti_aliasing=False,
        order=3)

    return img
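
# Hypothetical usage of the resize helper above (not part of the original
# script; assumes numpy is imported as np, as at the top of this file):
_sample = np.random.rand(90, 120)   # arbitrary grayscale image
print(resize(_sample).shape)        # -> (45, 45)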

Example 20
        end_probability = y_pred_end[end_idx]

        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]

    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)

    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor(
        ['./data/train.json', './data/dev.json', './data/test.json'])

    train_c, train_q, train_y = ds.get_dataset('./data/train.json')
    test_c, test_q, test_y = ds.get_dataset('./data/dev.json')

    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    bidaf = BiDAF(clen=ds.max_clen,
                  qlen=ds.max_qlen,
                  emb_size=128,
                  max_features=len(ds.charset))
    bidaf.build_model()
    bidaf.model.fit([train_c, train_q],
                    train_y,
                    batch_size=16,
Example 21
    )
    with open(input_file, 'rb') as f:
        lang_data = pickle.load(f)
    new_data = [[preprocess.preprocess_sentence(w) for w in l.split('\t')]
                for l in lang_data[:NUM_DATA]]

    label_holder = []
    input_sentences = []
    for line in new_data:
        labels = postprocess.sentence_labeller(line[0], line[1])
        label_holder.append(labels)
        input_sentences.append(line[1])

        #label_holder = np.array(label_holder)
    #Pre-process the data
    data_holder = preprocess.Preprocessor(lang_data, NUM_DATA, 'TRAIN')
    _, target_dataset, _, output_table, _, max_length_tar, _, _, _, output_index2word, target_lengths = data_holder.finalise_dataset(
    )

    train_targets, val_targets, train_labels, val_labels, train_lengths, val_lengths = train_test_split(
        target_dataset, label_holder, target_lengths, test_size=TEST_SPLIT)
    #Feeding the data in reverse order helps with training
    #input_dataset = np.flip(input_dataset)

    #Create a dataset
    padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        train_targets, maxlen=max_length_tar, padding='post')
    label_holder = tf.keras.preprocessing.sequence.pad_sequences(
        train_labels, maxlen=max_length_tar, padding='post')
    padded_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        train_labels, maxlen=max_length_tar, padding='post')
Example 22
File: dep.py Project: xyuan/amrex
                        help="output a detailed log file describing each source file",
                        action="store_true")
    parser.add_argument("files", metavar="source files", type=str, nargs="*",
                        help="F90 source files to find dependencies amongst")

    args = parser.parse_args()

    if args.prefix != "":
        prefix_pass = "******".format(os.path.normpath(args.prefix))
    else:
        prefix_pass = "******"

    if args.temp_dir != "":
        temp_dir = args.temp_dir
    else:
        temp_dir = "./"

    # create a preprocessor object
    if args.cpp != "":
        cpp_pass = preprocess.Preprocessor(temp_dir=temp_dir, cpp_cmd=args.cpp,
                                           defines=args.defines,
                                           f90_preprocess=args.f90_preprocess)
    else:
        cpp_pass = None

    try:
        doit(prefix_pass, args.search_path.split(), args.files, cpp_pass, debug=args.debug)
    except:
        # something went wrong
        print("$(error something went wrong in dep.py.  Remake, adding the option 'DEP_CHECK_OPTS=--debug' to your make command and examine the 'dependencies.out' file)")
Example 23
        if prediction_word == '<EOS>':
            return decoded_text, sentence, attention_matrix

        decoder_input = tf.expand_dims([prediction_id],0)

    return decoded_text, sentence, attention_matrix

if __name__ == '__main__':

    input_file = os.path.join('/Users/emielzyde/Desktop/Project/grammar_correction/lang8_preprocess.pickle')
    with open(input_file, 'rb') as f:
        #lang_data = f.readlines()
        lang_data = pickle.load(f)
        #lang_data = lang_data.readlines()
    #Pre-process the data
    data_holder = preprocess.Preprocessor(lang_data, 2000, 'TRAIN')
    input_dataset, target_dataset, input_table, output_table, max_length_inp, max_length_tar, input_word2index, output_word2index, input_index2word, output_index2word = data_holder.finalise_dataset()

    train_input_dataset, val_input_dataset, train_target_dataset, val_target_dataset = train_test_split(input_dataset, target_dataset, test_size = TEST_SPLIT)
    #Feeding the data in reverse order helps with training
    #input_dataset = np.flip(input_dataset)

    print('The vocabulary size is {}'.format(len(input_word2index)))

    #Create a dataset
    number_batches = len(train_input_dataset) // BATCH_SIZE
    input_vocab_size = len(input_table.word2index)
    target_vocab_size = len(output_table.word2index)
    dataset = tf.data.Dataset.from_tensor_slices((train_input_dataset, train_target_dataset)).shuffle(len(train_input_dataset))
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
Example 24
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]

    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)

    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor([
        './data/squad/train-v1.1.json', './data/squad/dev-v1.1.json',
        './data/squad/dev-v1.1.json'
    ])
    train_c, train_q, train_y = ds.get_dataset('./data/squad/train-v1.1.json')
    test_c, test_q, test_y = ds.get_dataset('./data/squad/dev-v1.1.json')

    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    # load the samples' GloVe word vectors and initialize the char embeddings
    train_c, train_q, train_y = ds.get_chardataset(
        './data/squad/train-v1.1.json')

    bidaf = BiDAF(clen=ds.max_clen,
                  qlen=ds.max_qlen,
                  emb_size=50,
                  max_features=len(ds.charset))
Example 25
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
import time
import os
import preprocess
import LSTM
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# file = open('./figures/output1.txt', 'w')
# sys.stdout = file

if __name__ == '__main__':
    preprocessor = preprocess.Preprocessor()
    ratio = 0.7
    # preprocessor.visualize_data()
    preprocessor.tokenize_data()
    training_data_x = preprocessor.sequenced_summaries[:int(
        ratio * len(preprocessor.sequenced_summaries))]
    training_data_y = preprocessor.rewards[:int(ratio *
                                                len(preprocessor.
                                                    sequenced_summaries))]
    data_x = preprocessor.sequenced_summaries[int(ratio *
                                                  len(preprocessor.
                                                      sequenced_summaries)):]
    data_y = preprocessor.rewards[int(ratio *
                                      len(preprocessor.sequenced_summaries)):]
    # print("hey there")
    # for i in range(7):
Example 26
train_path = "../input/train_data.csv"
test_path = "../input/test_data.csv"
""" load raw data"""
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
""" Preprocessing"""
import preprocess as pr
import impute as im

import copy

df = train["y"]

predata = pd.concat([train.drop("y", axis=1), test], ignore_index=True)
predata_copy = copy.deepcopy(predata)
predata_onehot = pr.Preprocessor(predata_copy).all("onehot", "nonpub")
#predata_label = pr.Preprocessor(predata_copy).all("label", "nonpub")

#prep_train_label = pd.concat([df, predata_label.iloc[:len(train), :]], axis=1)
#prep_test_label = predata_label.iloc[len(train):, :]

num_list = [
    "TimeToNearestStation", "TotalFloorArea", "Area", "Frontage",
    "BuildingYear", "BuildingAge", "Breadth", "CoverageRatio",
    "FloorAreaRatio", "Period"
]
predata_onehot = im.Imputer(predata_onehot).num_imputer(num_list)
print(predata_onehot[num_list].isnull().sum())

prep_train_onehot = pd.concat([df, predata_onehot.iloc[:len(train), :]],
                              axis=1)
Example 27
        default="")

    args = parser.parse_args()

    defines = args.defines

    if args.exclude_defines != "":
        excludes = args.exclude_defines.split()
        for ex in excludes:
            defines = defines.replace(ex, "")

    print("defines: ", defines)

    if args.cpp != "":
        cpp_pass = preprocess.Preprocessor(temp_dir=args.output_dir,
                                           cpp_cmd=args.cpp,
                                           defines=defines)
    else:
        cpp_pass = None

    headers, _ = ffv.find_files(args.vpath, args.headers)
    cxx, _ = ffv.find_files(args.vpath, args.cxx)

    # part I: we need to find the names of the Fortran routines that
    # are called from C++ so we can modify the header in the
    # corresponding *_F.H file.

    # A list of specific macros that we want to look for in each target.

    macro_list = [
        'AMREX_INT_ANYD', 'AMREX_REAL_ANYD', 'BL_TO_FORTRAN_ANYD',
Example 28
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]

    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)

    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor([
        './data/drcd/DRCD_training.json', './data/drcd/DRCD_dev.json',
        './data/drcd/DRCD_training.json'
    ])

    train_c, train_q, train_y = ds.get_dataset(
        './data/drcd/DRCD_training.json')
    test_c, test_q, test_y = ds.get_dataset('./data/drcd/DRCD_dev.json')

    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    bidaf = BiDAF(clen=ds.max_clen,
                  qlen=ds.max_qlen,
                  emb_size=128,
                  max_features=len(ds.charset))
    bidaf.build_model()
    bidaf.model.fit([train_c, train_q],
Example 29
test_path = "../input/test_data.csv"
""" load raw data"""
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
""" Preprocessing"""
import preprocess as pr
import impute as im

import copy

df = train["y"]

predata = pd.concat([train.drop("y", axis=1), test], ignore_index=True)
predata_copy = copy.deepcopy(predata)
#predata_onehot = pr.Preprocessor(predata).all("onehot")
predata_label = pr.Preprocessor(predata_copy).all("label", "nonpub")

prep_train_label = pd.concat([df, predata_label.iloc[:len(train), :]], axis=1)
prep_test_label = predata_label.iloc[len(train):, :]
""" define data"""
train_X = prep_train_label.drop(["y", "id", "Prefecture", "Municipality"],
                                axis=1)
train_y = np.log1p(prep_train_label["y"])
test_X = prep_test_label.drop(["id", "Prefecture", "Municipality"], axis=1)
""" divine data"""
train_X_tyuko = train_X[train_X["Type"] == 1]
train_X_tatemono = train_X[train_X["Type"] == 2]
train_X_toti = train_X[train_X["Type"] == 3]
train_y_tyuko = train_y[train_X_tyuko.index]
train_y_tatemono = train_y[train_X_tatemono.index]
train_y_toti = train_y[train_X_toti.index]
Example 30
# -*- coding: utf-8 -*-
#   Project name : Evi-Fact
#   Edit with PyCharm
#   Created by simengzhao on 2018/8/17 at 2:08 PM
#   Nanjing University Software Institute
#

import tensorflow as tf
import numpy as np
import json
import re
import preprocess as PP
import model
npk = PP.Preprocessor(False)
GEFG = model.gated_evidence_fact_generation()
dg = npk.data_provider(
    'train_data.json', {
        'NAME': 'GEFG',
        'MEL': GEFG.MAX_EVID_LEN,
        'MEC': GEFG.MAX_EVIDS,
        'MFL': GEFG.MAX_FACT_LEN,
        'BATCH_SIZE': 1
    })

tf.nn.dynamic_rnn()
m1 = tf.placeholder(dtype=tf.float32, shape=[5, 3, 4])
m2 = tf.placeholder(dtype=tf.float32, shape=[1, 3, 4])
r1 = m1
r2[3] = m2
# r1 = tf.reduce_sum(r1,1)
with tf.Session() as sess: