Example 1
def prep_data():
    input_texts, mesh_outputs = load_data.assemble_pairs()
    abstract_p = preprocess.Preprocessor()
    
    # preprocess and encode texts (inputs)
    abstract_p.preprocess(input_texts)
    X = abstract_p.encode_texts(input_texts)

    labels_p = preprocess.Preprocessor(vocab_size=None, split_char=".", normalize=False)
    labels_p.preprocess(mesh_outputs)
    Y = labels_p.encode_texts(mesh_outputs)

    return (input_texts, abstract_p, mesh_outputs, labels_p, list(zip(X,Y)))
Example 2
def test():
    '''
    Trains the model, makes predictions, and prints its cross-validation score
    '''
    matplotlib.rcParams['backend'] = 'Qt5Agg'
    matplotlib.get_backend()
    D = DataManager(data_name, data_dir)
    # Load the model
    mdl = model()

    Prepro = prepro.Preprocessor()
    #D.data['X_train'] = Prepro.removeOutliers(D.data['X_train'])
    #D.data['Y_train'] = Prepro.removeOutliers(D.data['Y_train'])
    X_train = D.data['X_train']
    Y_train = D.data['Y_train'].ravel()

    # test training
    mdl.fit(X_train, Y_train)

    # test prediction
    Y_hat_train = mdl.predict(D.data['X_train'])
    Y_hat_valid = mdl.predict(D.data['X_valid'])
    Y_hat_test = mdl.predict(D.data['X_test'])

    metric_name, scoring_function = get_metric()
    scores = cross_val_score(mdl,
                             X_train,
                             Y_train,
                             cv=5,
                             scoring=make_scorer(scoring_function))
    print('\nCV score (95 perc. CI): %0.2f (+/- %0.2f)' %
          (scores.mean(), scores.std() * 2))
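
For reference, a minimal self-contained version of the same cross-validation scoring pattern, with a toy dataset and plain accuracy standing in for the project's model() and get_metric() helpers (all names below are illustrative only):

# Sketch: 5-fold CV scoring with a custom scorer, analogous to the snippet above.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
clf = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf, X_demo, y_demo, cv=5,
                         scoring=make_scorer(accuracy_score))
print('CV score (95 perc. CI): %0.2f (+/- %0.2f)' %
      (scores.mean(), scores.std() * 2))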
Example 3
def jsd(fileName, query):
    pp = preprocess.Preprocessor()
    fileText = pp.prepDoc(fileName, combine = True)
    queryText = pp.preprocess(query)
    texts = [queryText, fileText]
    probDists = getProbDists(texts)
    return jensenshannon.jensen_shannon_divergence(numpy.array(probDists))
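
The getProbDists and jensenshannon helpers above are project code not shown here; a minimal sketch of the underlying computation using SciPy, which returns the Jensen-Shannon distance (the square root of the divergence):

# Sketch: Jensen-Shannon divergence between two toy probability distributions.
import numpy as np
from scipy.spatial.distance import jensenshannon

p = np.array([0.5, 0.3, 0.2])   # e.g. query term distribution
q = np.array([0.4, 0.4, 0.2])   # e.g. document term distribution
js_distance = jensenshannon(p, q)
js_divergence = js_distance ** 2
print(js_divergence)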
Example 4
def testFile(fileName, query):
    pp = preprocess.Preprocessor()
    fileText = pp.prepDoc(fileName, combine=True)
    #print fileText
    queryText = pp.preprocess(query)
    texts = [queryText, fileText]
    #print texts
    probDists = getProbDists(texts)
    #print probDists
    print(jensenshannon.jensen_shannon_divergence(numpy.array(probDists)))
Example 5
def data_prep(seed):
    # Renamed locals so they don't shadow the imported modules; re-assigning the
    # module name inside the function would raise UnboundLocalError.
    profile_obj = profile.Profile()
    interest_obj = interest.Interest()
    preprocessor = preprocess.Preprocessor()
    profile_raw = profile_obj.get_profile()
    interest_raw, ids = interest_obj.data_merge()
    data = preprocessor.finalize_data(profile_raw, interest_raw)
    X, y, X_train, y_train, X_test, y_test = preprocessor.split_data(
        data, seed=seed, re=False)
    return X, y, X_train, y_train, X_test, y_test, ids
Example 6
def preprocess_data(url, seed):
    preprocessor = preprocess.Preprocessor()
    raw_data = preprocessor.get_data(url)
    contain_null = preprocessor.get_null(raw_data)
    for f in contain_null:
        raw_data.loc[(raw_data[f].isnull()),
                     f] = preprocessor.ImputeVoteClassifier(raw_data, f)
    X_train, y_train, X_test, y_test = preprocessor.split_data(raw_data,
                                                               seed,
                                                               re=False)
    return X_train, y_train, X_test, y_test
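
ImputeVoteClassifier and split_data are project-specific; a generic sketch of the same loc-based imputation loop using pandas, with the column mode as a stand-in for the classifier-based imputer:

# Sketch: fill each null-containing column in place, mirroring the loop above.
import pandas as pd

df = pd.DataFrame({"a": [1.0, None, 3.0], "b": ["x", "y", None]})
contain_null = [c for c in df.columns if df[c].isnull().any()]
for f in contain_null:
    # stand-in imputer: use the column mode
    df.loc[df[f].isnull(), f] = df[f].mode().iloc[0]
print(df)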
Example 7
def main():
    preprocessor = preprocess.Preprocessor(
        '{}/words.txt'.format(cwd))  # preprocess is called
    ''' poor data sets: '''
    # preprocessor2 = preprocess.Preprocessor('{}/2-letter-words.json'.format(cwd))
    # preprocessor3 = preprocess.Preprocessor('{}/3-letter-words.json'.format(cwd))
    # preprocessor4 = preprocess.Preprocessor('{}/4-letter-words.json'.format(cwd))
    # preprocessor5 = preprocess.Preprocessor('{}/5-letter-words.json'.format(cwd))
    # preprocessor6 = preprocess.Preprocessor('{}/6-letter-words.json'.format(cwd))
    # preprocessor7 = preprocess.Preprocessor('{}/7-letter-words.json'.format(cwd))
    # preprocessor8 = preprocess.Preprocessor('{}/8-letter-words.json'.format(cwd))
    # preprocessor9 = preprocess.Preprocessor('{}/9-letter-words.json'.format(cwd))
    # preprocessor10 = preprocess.Preprocessor('{}/10-letter-words.json'.format(cwd))
    # preprocessor11 = preprocess.Preprocessor('{}/11-letter-words.json'.format(cwd))
    # preprocessor12 = preprocess.Preprocessor('{}/12-letter-words.json'.format(cwd))
    vocabpreprocessor = preprocess.Preprocessor('{}/vocab.txt'.format(cwd))
    moreWords = preprocess.Preprocessor(
        '{}/entriesWithCollocates.txt'.format(cwd))
    global wordDict
    '''preprocessor.processedWords +'''
    # wordDict = preprocessor.processedWords + preprocessor5.processedWords + preprocessor6.processedWords + preprocessor7.processedWords + preprocessor8.processedWords + preprocessor9.processedWords + preprocessor10.processedWords + preprocessor11.processedWords + preprocessor12.processedWords + vocabpreprocessor.processedWords + moreWords.processedWords
    wordDict = preprocessor.processedWords + vocabpreprocessor.processedWords + moreWords.processedWords
    # set the word dict so the game can find the best guesses
    # print 'PROCESSED WORDS={}'.format(preprocessor.processedWords)
    game = Game()  # starts the game

    while 1:
        GUESS = game.getNextBestGuess()
        game.guess(GUESS)
    # try:
    # 	while 1:
    # 		GUESS = game.getNextBestGuess()
    # 		game.guess(GUESS)
    # except:
    # 	print '\nGAME ENDED'
    return 0
Example 8
 def get_data(
     self,
     dsn_database,
     dsn_hostname,
     dsn_port,
     dsn_protocol,
     dsn_uid,
     dsn_pwd,
     level,
 ):
     # Use a distinct local name so the imported "preprocess" module isn't
     # shadowed (re-assigning the module name here raises UnboundLocalError).
     preprocessor = preprocess.Preprocessor()
     raw_data = preprocessor.db2_connect(
         dsn_database, dsn_hostname, dsn_port, dsn_protocol, dsn_uid, dsn_pwd
     )
     data = preprocessor.data_preprocess(raw_data, level)
     return data
Example 9
    def compress(self, samFilename, compressedFilename, gtf, min_filename, frag_len_z_cutoff, split_diff_strands, split_discordant):
        ''' Compresses the alignments to 2 files, one for unspliced and one for spliced

            file_prefix: Prefix for all output file names
        '''

        self.p = preprocess.Preprocessor(samFilename, frag_len_z_cutoff, split_diff_strands)

        if not self.frag_len_cutoff:
            self.frag_len_cutoff = self.p.frag_len_cutoff
        print('Using fragment length cutoff of ' + str(self.frag_len_cutoff))

        if split_diff_strands:
            print('Splitting mates on different strands')
        else:
            print('Not splitting mates on different strands')

        if split_discordant:
            print('Splitting discordant')
        else:
            print('Not splitting discordant')

        # Reads on different strands that should be unpaired
        self.diff_strand_unpaired = self.p.unpaired
        del self.p

        # Read header
        header = ''
        with open(samFilename, 'r') as f:
            for line in f:
                if line[0] == '@':
                    header += line
                else:
                    break
        self.chromosomes = self.parseSAMHeader(header)
        self.aligned = alignments.Alignments(self.chromosomes, self.frag_len_cutoff, split_discordant)

        if gtf:
            self.aligned.gtf_exons = self.parseGTF(gtf, self.aligned.chromOffsets)

        self.compressByBundle(samFilename, compressedFilename, min_filename)

        #print('%d unmatched' % self.aligned.numUnmatched)
        print('Approximately %d / %d = %f%% of compressed file is coverage' % (self.covSize, self.totalSize, 100.0*float(self.covSize)/float(self.totalSize)))
        print('Finished compressing')
Example 10
def gate_value_report_write(fname,evids_ids,fact_ids,gate_v):
    '''
    Record the correspondence between gate values and generated facts; each
    generated fact is mapped to the id of the best evidence used to produce it.
    :param fname: output file name
    :param evids_ids: sequence of evidence ids
    :param fact_ids: sequence of fact ids
    :param gate_v: gate values
    :return:
    '''
    p = preprocess.Preprocessor(False)
    fact = p.get_char_list(fact_ids)


    evids = []
    e_w = []
    for e in evids_ids:
        if e[0] == 2:
            e_w.append(0)
            for i in range(len(e)):
                if e[i] == 1:
                    e = e[:i]
                    break
            evids.append(p.get_sentence(e))
        else:
            break
    f = open(fname,'a',encoding='utf-8')
    fact_len = 0
    for g_i in range(len(gate_v)):
        if int(fact_ids[g_i])==1:
            break
        fact_len+=1
        e_w[gate_v[g_i]]+=1
    for i in range(len(evids)):

        f.write('%d\t%s'%(e_w[i],evids[i]))
        f.write('\n')
    for g in range(fact_len):
        f.write('%d\t'%gate_v[g])
    f.write('\n')
    for f_c in fact:
        f.write(f_c+'\t')
    f.write('\n')

    f.close()
Example 11
    def __init__(self):
        """Main class for antiderivative detection."""
        app_id = 'LHLP7U-HHLKWGU3AT'.lower()

        self._wolfram_client = wolframalpha.Client(app_id)
        self.img_input = None  # type: t.Optional[np.ndarray]
        self.img_solved = None  # type: t.Optional[np.ndarray]
        self.img_segments = None  # type: t.Optional[t.Sequence[np.ndarray]]

        self.models = self._load_models(path=os.path.join(
            os.path.realpath(__file__)[:-len(os.path.basename(__file__))],
            "models"))

        self._preprocessor = preprocess.Preprocessor()
        self._postprocessor = postprocess.Postprocessor()

        # Must have correspondence with the class codification
        # used to train the CNN model loaded just above. Don't
        # change the symbol order.
        self._CLASS_SYMBOL = (
            "0",
            "1",
            "x",
            "+",
            "-",
            "/",
            "(",
            ")",
            "e",
            "integrate",
            "d",
            "2",
            "3",
            "4",
            "5",
            "6",
            "7",
            "8",
            "9",
        )

        self._RE_FIX_DNOTATION = re.compile(r"(?<=d)\s+(?=.)")
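
A hedged sketch of how the wolframalpha client created above is typically queried; the query string and result handling below are illustrative and not taken from this project:

# Sketch: query Wolfram|Alpha and read the first textual result pod.
import wolframalpha

client = wolframalpha.Client("YOUR-APP-ID")
res = client.query("integrate x^2 dx")
print(next(res.results).text)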
Example 12
 def __init__(self):
     '''
     fancy_classifier = Pipeline([
                 ('preprocessing', Preprocessor()),
                 ('classification', RandomForestClassifier(n_estimators=136, max_depth=None, min_samples_split=2, random_state=0))
                 ])
     self.clf = VotingClassifier(estimators=[
                 ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
                 ('Gaussian Classifier', GaussianNB()),
                 ('Support Vector Machine', SVC(probability=True)),
                 ('Fancy Classifier', fancy_classifier)],
                 voting='soft')   
     '''
     self.mdl = RandomForestClassifier(n_estimators=136,
                                       max_depth=None,
                                       min_samples_split=2,
                                       random_state=0)
     self.num_train_samples = 0
     self.num_feat = 1
     self.num_labels = 1
     self.prep = prepro.Preprocessor()
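
The commented-out ensemble above can be reproduced as a runnable sketch on a toy dataset; the project's Preprocessor pipeline step is omitted here:

# Sketch: soft-voting ensemble corresponding to the commented-out classifier.
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
clf = VotingClassifier(estimators=[
    ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
    ('Gaussian Classifier', GaussianNB()),
    ('Support Vector Machine', SVC(probability=True)),
    ('Random Forest', RandomForestClassifier(n_estimators=136, random_state=0))],
    voting='soft')
clf.fit(X_demo, y_demo)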
Example 13
            opt_lines = 1

    if not opt_system:
        if opt_testing:
            # If we are testing just use the lingogi file
            opt_system = os.path.join('platforms', 'lingogi', 'system.h')
        else:
            err("You need to specify -s, see -h")
            sys.exit(2)

    if not os.path.exists(opt_system):
        err("'%s' does not exist" % opt_system)
        sys.exit(2)

    # First do all preprocessing from pch.h
    processor = pp.Preprocessor()
    processor["PRODUCT_SYSTEM_FILE"] = '"' + opt_system + '"'
    processor.addUserIncludePath(".")
    processor.ignoreErrors()

    # If defines are specified we parse them here
    # currently treated as one space separated string
    # and we replace \" with " in string defines.
    #
    # FIXME: The python macros store the name
    # and value, the current code is pike inherited
    # and thus redundantly stores the name twice
    #
    if opt_defines:
        for define in opt_defines:
            if len(define):
Example 14
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]

    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)

    return tf.math.reduce_mean(acc, axis=0)
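
# Aside (not part of the original script): a minimal, self-contained sketch of the
# tf.map_fn pattern used above -- mapping a per-example function over the batch
# dimension of a tuple of tensors. Assumes tensorflow is imported as tf, as in
# the surrounding script; all names below are illustrative only.
def _map_fn_demo():
    xs = tf.random.uniform((4, 10))
    ys = tf.random.uniform((4, 10))
    per_example = lambda args: (tf.reduce_max(args[0]) + tf.reduce_max(args[1])) / 2.0
    return tf.map_fn(per_example, (xs, ys), dtype=tf.float32)  # shape: (4,)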

if __name__ == '__main__':
    tmp_path="D:/dhm/programer-lx/BiDAF_tf2"
    ds = preprocess.Preprocessor([
        tmp_path+'/data/squad/train-v1.1.json',
        tmp_path+'/data/squad/dev-v1.1.json',
        tmp_path+'/data/squad/dev-v1.1.json'
    ])

##    train_c, train_q, train_y = ds.get_dataset(tmp_path+'/data/squad/train-v1.1.json')
##    test_c, test_q, test_y = ds.get_dataset(tmp_path+'/data/squad/dev-v1.1.json')
    train_cc, train_cq, train_wc, train_wq, train_y = ds.get_dataset(tmp_path+'/data/squad/test.json')
    test_cc, test_cq, test_wc, test_wq, test_y = ds.get_dataset(tmp_path+'/data/squad/test.json')

    bidaf = BiDAF(
        clen=ds.max_clen,
        qlen=ds.max_qlen,
        emb_size=50,
        max_features=len(ds.charset),  # ds.charset
        vocab_size=len(ds.word_list),
        conv_layers=[[10, 1], [10, 2], [30, 3]],  # sizes and counts of the convolution kernels
Example 15
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]

    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)

    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor(
        ['./data/squad/train-v1.1.json', './data/squad/dev-v1.1.json'],
        ['./data/glove.6B/glove.6B.50d.txt'])
    '''
    train_c, train_q, train_y = ds.get_dataset('./data/squad/train-v1.1.json')
    test_c, test_q, test_y = ds.get_dataset('./data/squad/dev-v1.1.json')
    

    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    '''

    bidaf = BiDAF(
        clen=ds.max_clen,
        qlen=ds.max_qlen,
        emb_size=50,
Example 16
def main():
    preprocessor = preprocess.Preprocessor(
        '{}/2-letter-words.json'.format(cwd))  # preprocess is called
    print(preprocessor.processedWords)
Example 17
test_path = "../input/test_data.csv"
""" load raw data"""
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
""" Preprocessing"""
import preprocess as pr
import impute as im

import copy

df = train["y"]

predata = pd.concat([train.drop("y", axis=1), test], ignore_index=True)
predata_copy = copy.deepcopy(predata)
"""predata_onehot = pr.Preprocessor(predata).all("onehot")"""
predata_label = pr.Preprocessor(predata_copy).all("label", "date")
"""prep_train_onehot = pd.concat([df, predata_onehot.iloc[:len(train), :]], axis=1)
prep_test_onehot = predata_onehot.iloc[len(train):, :]"""

prep_train_label = pd.concat([df, predata_label.iloc[:len(train), :]], axis=1)
prep_test_label = predata_label.iloc[len(train):, :]
"""prep_train_onehot.to_csv("../prep_train_onehot.csv", index=False)
prep_test_onehot.to_csv("../prep_test_onehot.csv", index=False)
prep_train_label.to_csv("../prep_train_label.csv", index=False)
prep_test_label.to_csv("../prep_test_label.csv", index=False)"""
""" define data"""
train_X = prep_train_label.drop([
    "y", "video_id", "thumbnail_link", "publishedAt", "collection_date", "id",
    "tags", "description", "title"
],
                                axis=1)
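
Generic sketch of the concat, preprocess jointly, then split-back pattern used above; pr.Preprocessor is project code, so pandas one-hot encoding stands in here:

# Sketch: preprocess train and test together, then split the rows back apart.
import pandas as pd

train_demo = pd.DataFrame({"y": [1, 2], "cat": ["a", "b"]})
test_demo = pd.DataFrame({"cat": ["b", "c"]})
both = pd.concat([train_demo.drop("y", axis=1), test_demo], ignore_index=True)
both = pd.get_dummies(both)                     # joint preprocessing
train_feats = both.iloc[:len(train_demo), :]    # split back into train...
test_feats = both.iloc[len(train_demo):, :]     # ...and test rows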
Example 18
def test(string1, string2):
    pp = preprocess.Preprocessor()
    texts = [pp.preprocess(string1), pp.preprocess(string2)]

    #print dictionary.token2id
    getProbDists(texts)
Example 19
import os
import re
import sys
import typing as t

import numpy as np
import skimage
import skimage.transform
import imageio

sys.path.insert(0, "../antideriv")
import preprocess as antideriv_preproc  # noqa: ignore


OUTPUT_PATH = "./data-augmented-preprocessed"
RE_CLASS_NAME = re.compile(r"(?<=class_)[^_]+")
OUTPUT_FILE_TYPE = "png"

PREPROCESSOR_MODEL = antideriv_preproc.Preprocessor()
"""Preprocess the training data the same way as a regular input."""


def resize(img: np.ndarray,
           output_shape: t.Tuple[int, int] = (45, 45)) -> np.ndarray:
    """Resize image to ``output_shape`` with interpolation of order 3."""
    img = skimage.transform.resize(
        image=img,
        output_shape=output_shape,
        anti_aliasing=False,
        order=3)

    return img
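
# Hypothetical usage of the resize helper above (not part of the original
# script; assumes numpy is imported as np, as at the top of this file):
_sample = np.random.rand(90, 120)   # arbitrary grayscale image
print(resize(_sample).shape)        # -> (45, 45)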

Example 20
        end_probability = y_pred_end[end_idx]

        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]

    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)

    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor(
        ['./data/train.json', './data/dev.json', './data/test.json'])

    train_c, train_q, train_y = ds.get_dataset('./data/train.json')
    test_c, test_q, test_y = ds.get_dataset('./data/dev.json')

    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    bidaf = BiDAF(clen=ds.max_clen,
                  qlen=ds.max_qlen,
                  emb_size=128,
                  max_features=len(ds.charset))
    bidaf.build_model()
    bidaf.model.fit([train_c, train_q],
                    train_y,
                    batch_size=16,
Example 21
    )
    with open(input_file, 'rb') as f:
        lang_data = pickle.load(f)
    new_data = [[preprocess.preprocess_sentence(w) for w in l.split('\t')]
                for l in lang_data[:NUM_DATA]]

    label_holder = []
    input_sentences = []
    for line in new_data:
        labels = postprocess.sentence_labeller(line[0], line[1])
        label_holder.append(labels)
        input_sentences.append(line[1])

        #label_holder = np.array(label_holder)
    #Pre-process the data
    data_holder = preprocess.Preprocessor(lang_data, NUM_DATA, 'TRAIN')
    _, target_dataset, _, output_table, _, max_length_tar, _, _, _, output_index2word, target_lengths = data_holder.finalise_dataset(
    )

    train_targets, val_targets, train_labels, val_labels, train_lengths, val_lengths = train_test_split(
        target_dataset, label_holder, target_lengths, test_size=TEST_SPLIT)
    #Feeding the data in reverse order helps with training
    #input_dataset = np.flip(input_dataset)

    #Create a dataset
    padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        train_targets, maxlen=max_length_tar, padding='post')
    label_holder = tf.keras.preprocessing.sequence.pad_sequences(
        train_labels, maxlen=max_length_tar, padding='post')
    padded_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        train_labels, maxlen=max_length_tar, padding='post')
Example 22
File: dep.py Project: xyuan/amrex
                        help="output a detailed log file describing each source file",
                        action="store_true")
    parser.add_argument("files", metavar="source files", type=str, nargs="*",
                        help="F90 source files to find dependencies amongst")

    args = parser.parse_args()

    if args.prefix != "":
        prefix_pass = "******".format(os.path.normpath(args.prefix))
    else:
        prefix_pass = "******"

    if args.temp_dir != "":
        temp_dir = args.temp_dir
    else:
        temp_dir = "./"

    # create a preprocessor object
    if args.cpp != "":
        cpp_pass = preprocess.Preprocessor(temp_dir=temp_dir, cpp_cmd=args.cpp,
                                           defines=args.defines,
                                           f90_preprocess=args.f90_preprocess)
    else:
        cpp_pass = None

    try:
        doit(prefix_pass, args.search_path.split(), args.files, cpp_pass, debug=args.debug)
    except:
        # something went wrong
        print("$(error something went wrong in dep.py.  Remake, adding the option 'DEP_CHECK_OPTS=--debug' to your make command and examine the 'dependencies.out' file)")
Example 23
        if prediction_word == '<EOS>':
            return decoded_text, sentence, attention_matrix

        decoder_input = tf.expand_dims([prediction_id],0)

    return decoded_text, sentence, attention_matrix

if __name__ == '__main__':

    input_file = os.path.join('/Users/emielzyde/Desktop/Project/grammar_correction/lang8_preprocess.pickle')
    with open(input_file, 'rb') as f:
        #lang_data = f.readlines()
        lang_data = pickle.load(f)
        #lang_data = lang_data.readlines()
    #Pre-process the data
    data_holder = preprocess.Preprocessor(lang_data, 2000, 'TRAIN')
    input_dataset, target_dataset, input_table, output_table, max_length_inp, max_length_tar, input_word2index, output_word2index, input_index2word, output_index2word = data_holder.finalise_dataset()

    train_input_dataset, val_input_dataset, train_target_dataset, val_target_dataset = train_test_split(input_dataset, target_dataset, test_size = TEST_SPLIT)
    #Feeding the data in reverse order helps with training
    #input_dataset = np.flip(input_dataset)

    print('The vocabulary size is {}'.format(len(input_word2index)))

    #Create a dataset
    number_batches = len(train_input_dataset) // BATCH_SIZE
    input_vocab_size = len(input_table.word2index)
    target_vocab_size = len(output_table.word2index)
    dataset = tf.data.Dataset.from_tensor_slices((train_input_dataset, train_target_dataset)).shuffle(len(train_input_dataset))
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
Example 24
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]

    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)

    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor([
        './data/squad/train-v1.1.json', './data/squad/dev-v1.1.json',
        './data/squad/dev-v1.1.json'
    ])
    train_c, train_q, train_y = ds.get_dataset('./data/squad/train-v1.1.json')
    test_c, test_q, test_y = ds.get_dataset('./data/squad/dev-v1.1.json')

    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    # load the samples' GloVe word vectors and initialize the char embeddings
    train_c, train_q, train_y = ds.get_chardataset(
        './data/squad/train-v1.1.json')

    bidaf = BiDAF(clen=ds.max_clen,
                  qlen=ds.max_qlen,
                  emb_size=50,
                  max_features=len(ds.charset))
Example 25
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
import time
import os
import preprocess
import LSTM
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# file = open('./figures/output1.txt', 'w')
# sys.stdout = file

if __name__ == '__main__':
    preprocessor = preprocess.Preprocessor()
    ratio = 0.7
    # preprocessor.visualize_data()
    preprocessor.tokenize_data()
    training_data_x = preprocessor.sequenced_summaries[:int(
        ratio * len(preprocessor.sequenced_summaries))]
    training_data_y = preprocessor.rewards[:int(ratio *
                                                len(preprocessor.
                                                    sequenced_summaries))]
    data_x = preprocessor.sequenced_summaries[int(ratio *
                                                  len(preprocessor.
                                                      sequenced_summaries)):]
    data_y = preprocessor.rewards[int(ratio *
                                      len(preprocessor.sequenced_summaries)):]
    # print("hey there")
    # for i in range(7):
Example 26
train_path = "../input/train_data.csv"
test_path = "../input/test_data.csv"
""" load raw data"""
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
""" Preprocessing"""
import preprocess as pr
import impute as im

import copy

df = train["y"]

predata = pd.concat([train.drop("y", axis=1), test], ignore_index=True)
predata_copy = copy.deepcopy(predata)
predata_onehot = pr.Preprocessor(predata_copy).all("onehot", "nonpub")
#predata_label = pr.Preprocessor(predata_copy).all("label", "nonpub")

#prep_train_label = pd.concat([df, predata_label.iloc[:len(train), :]], axis=1)
#prep_test_label = predata_label.iloc[len(train):, :]

num_list = [
    "TimeToNearestStation", "TotalFloorArea", "Area", "Frontage",
    "BuildingYear", "BuildingAge", "Breadth", "CoverageRatio",
    "FloorAreaRatio", "Period"
]
predata_onehot = im.Imputer(predata_onehot).num_imputer(num_list)
print(predata_onehot[num_list].isnull().sum())

prep_train_onehot = pd.concat([df, predata_onehot.iloc[:len(train), :]],
                              axis=1)
Example 27
        default="")

    args = parser.parse_args()

    defines = args.defines

    if args.exclude_defines != "":
        excludes = args.exclude_defines.split()
        for ex in excludes:
            defines = defines.replace(ex, "")

    print("defines: ", defines)

    if args.cpp != "":
        cpp_pass = preprocess.Preprocessor(temp_dir=args.output_dir,
                                           cpp_cmd=args.cpp,
                                           defines=defines)
    else:
        cpp_pass = None

    headers, _ = ffv.find_files(args.vpath, args.headers)
    cxx, _ = ffv.find_files(args.vpath, args.cxx)

    # part I: we need to find the names of the Fortran routines that
    # are called from C++ so we can modify the header in the
    # corresponding *_F.H file.

    # A list of specific macros that we want to look for in each target.

    macro_list = [
        'AMREX_INT_ANYD', 'AMREX_REAL_ANYD', 'BL_TO_FORTRAN_ANYD',
Example 28
        return (start_probability + end_probability) / 2.0

    y_true = tf.squeeze(y_true)
    y_pred_start = y_pred[:, 0, :]
    y_pred_end = y_pred[:, 1, :]

    inputs = (y_true, y_pred_start, y_pred_end)
    acc = tf.map_fn(calc_acc, inputs, dtype=tf.float32)

    return tf.math.reduce_mean(acc, axis=0)


if __name__ == '__main__':
    ds = preprocess.Preprocessor([
        './data/drcd/DRCD_training.json', './data/drcd/DRCD_dev.json',
        './data/drcd/DRCD_training.json'
    ])

    train_c, train_q, train_y = ds.get_dataset(
        './data/drcd/DRCD_training.json')
    test_c, test_q, test_y = ds.get_dataset('./data/drcd/DRCD_dev.json')

    print(train_c.shape, train_q.shape, train_y.shape)
    print(test_c.shape, test_q.shape, test_y.shape)

    bidaf = BiDAF(clen=ds.max_clen,
                  qlen=ds.max_qlen,
                  emb_size=128,
                  max_features=len(ds.charset))
    bidaf.build_model()
    bidaf.model.fit([train_c, train_q],
Example 29
test_path = "../input/test_data.csv"
""" load raw data"""
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
""" Preprocessing"""
import preprocess as pr
import impute as im

import copy

df = train["y"]

predata = pd.concat([train.drop("y", axis=1), test], ignore_index=True)
predata_copy = copy.deepcopy(predata)
#predata_onehot = pr.Preprocessor(predata).all("onehot")
predata_label = pr.Preprocessor(predata_copy).all("label", "nonpub")

prep_train_label = pd.concat([df, predata_label.iloc[:len(train), :]], axis=1)
prep_test_label = predata_label.iloc[len(train):, :]
""" define data"""
train_X = prep_train_label.drop(["y", "id", "Prefecture", "Municipality"],
                                axis=1)
train_y = np.log1p(prep_train_label["y"])
test_X = prep_test_label.drop(["id", "Prefecture", "Municipality"], axis=1)
""" divine data"""
train_X_tyuko = train_X[train_X["Type"] == 1]
train_X_tatemono = train_X[train_X["Type"] == 2]
train_X_toti = train_X[train_X["Type"] == 3]
train_y_tyuko = train_y[train_X_tyuko.index]
train_y_tatemono = train_y[train_X_tatemono.index]
train_y_toti = train_y[train_X_toti.index]
Example 30
# -*- coding: utf-8 -*-
#   Project name : Evi-Fact
#   Edit with PyCharm
#   Created by simengzhao on 2018/8/17 at 2:08 PM
#   Nanjing University Software Institute
#

import tensorflow as tf
import numpy as np
import json
import re
import preprocess as PP
import model
npk = PP.Preprocessor(False)
GEFG = model.gated_evidence_fact_generation()
dg = npk.data_provider(
    'train_data.json', {
        'NAME': 'GEFG',
        'MEL': GEFG.MAX_EVID_LEN,
        'MEC': GEFG.MAX_EVIDS,
        'MFL': GEFG.MAX_FACT_LEN,
        'BATCH_SIZE': 1
    })

tf.nn.dynamic_rnn()
m1 = tf.placeholder(dtype=tf.float32, shape=[5, 3, 4])
m2 = tf.placeholder(dtype=tf.float32, shape=[1, 3, 4])
r1 = m1
r2[3] = m2
# r1 = tf.reduce_sum(r1,1)
with tf.Session() as sess: