def main():
    # setup logging --------------------------
    logging.basicConfig(filename='plsa.log', level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                        datefmt='%a, %d %b %Y %H:%M:%S')
    #console = logging.StreamHandler()
    #console.setLevel(logging.INFO)
    #logging.getLogger('').addHandler(console)

    # some basic configuration ---------------
    fname = './data.txt'
    fsw = './stopwords.txt'
    eps = 20.0
    key_word_size = 10

    # preprocess -----------------------------
    pp = PP(fname, fsw)
    w_d = pp.get_w_d()
    V, D = w_d.shape
    logging.info('V = %d, D = %d' % (V, D))

    # train model and get result -------------
    pmodel = PLSA()
    for z in range(3, (D + 1), 10):
        t1 = time.clock()
        (l, p_d_z, p_w_z, p_z) = pmodel.train(w_d, z, eps)
        t2 = time.clock()
        # the second value is the likelihood returned by train(), so label it as such
        # (the original format string called it "eps" while passing l)
        logging.info('z = %d, llh = %f, time = %f' % (z, l, t2 - t1))
        for itz in range(z):
            logging.info('Topic %d' % itz)
            data = [(p_w_z[i][itz], i) for i in range(len(p_w_z[:, itz]))]
            data.sort(key=lambda tup: tup[0], reverse=True)
            for i in range(key_word_size):
                logging.info('%s : %.6f ' % (pp.get_word(data[i][1]), data[i][0]))
def testPreprocessWithElongatedWords(self):
    tweet = "dili kaayu klaro imuha :( HAHAHAHA haaaaaaaaaaaaaaays! dapat ipa zoom ang nawong pa more HAHAHAHA :P"
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["dili kaayo klaro imuha", [":("]],
                ["hahahaha haays", []],
                ["dapat ipa zoom ang nawong pa more hahahaha", [":P"]]]
    self.assertEqual(result, expected)
def __init__(self):
    cursor.execute("SELECT content FROM data")
    scripts = cursor.fetchall()
    fw = open('vector.txt', 'w')
    fresult = open('result.txt', 'w')
    mPreprocess = Preprocess()
    mPairedToken = PairToken()
    mConvertVector = ConvertVector()
    stanford = StanfordCoreNLP('http://localhost:9000')
    for script in scripts:
        # if type(script) is tuple:
        listToken = mPreprocess.exec(script[0])
        # else:
        #     listToken = mPreprocess.exec(script)
        listCouple = mPairedToken.exec(listToken)
        output = stanford.annotate(script[0],
                                   properties={'annotators': 'coref', 'outputFormat': 'json'})
        for mCoupleToken in listCouple:
            if self.checkCoreF(output['corefs'], mCoupleToken):
                # fresult.write(str(1) + ' ' + mCoupleToken.np1.text + ' ' + mCoupleToken.np2.text)
                fresult.write(str(1))
                fresult.write('\n')
            else:
                # fresult.write(str(-1) + ' ' + mCoupleToken.np1.text + ' ' + mCoupleToken.np2.text)
                fresult.write(str(-1))
                fresult.write('\n')
            vector = mConvertVector.exec(mCoupleToken)
            fw.write(str(vector))
            fw.write('\n')
def main():
    # setup logging --------------------------
    logging.basicConfig(filename='plsa.log', level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                        datefmt='%a, %d %b %Y %H:%M:%S')
    #console = logging.StreamHandler()
    #console.setLevel(logging.INFO)
    #logging.getLogger('').addHandler(console)

    # some basic configuration ---------------
    fname = './data.txt'
    fsw = './stopwords.txt'
    eps = 20.0
    key_word_size = 10

    # preprocess -----------------------------
    pp = PP(fname, fsw)
    w_d = pp.get_w_d()
    V, D = w_d.shape
    logging.info('V = %d, D = %d' % (V, D))

    # train model and get result -------------
    pmodel = PLSA()
    for z in range(3, (D + 1), 10):
        t1 = time.clock()
        (l, p_d_z, p_w_z, p_z) = pmodel.train(w_d, z, eps)
        t2 = time.clock()
        #logging.info('z = %d, eps = %f, time = %f' % (z, l, t2-t1))
        #print('z = %d, eps = %f, time = %f' % (z, l, t2-t1))
        for itz in range(z):
            logging.info('Topic %d' % itz)
            data = [(p_w_z[i][itz], i) for i in range(len(p_w_z[:, itz]))]
            data.sort(key=lambda tup: tup[0], reverse=True)
            for i in range(key_word_size):
                logging.info('%s : %.6f ' % (pp.get_word(data[i][1]), data[i][0]))
                print('%s : %.6f ' % (pp.get_word(data[i][1]), data[i][0]))
def main():
    '''Training of the model on the preprocessed data.'''
    preprocess = Preprocess()
    data = preprocess.getData(
        path="creditcard.csv",       # path of the csv file
        feature_incides=[0, 29],     # column indices of the features
        label_indices=[30],          # column indices of the labels
        training_size=0.5,           # size of the training set
        standardize=True,            # apply standardization?
        eval_set=True                # create an evaluation set?
    )
    model = Model(
        batch_size=10,               # size of the training batch
        epochs=50,                   # number of training epochs
        nodes=[29, 200, 2],          # list of neurons: the first entry is the number of input
                                     # neurons, the last entry the number of output neurons,
                                     # and the values in between are the hidden neurons
        learning_rate=0.0001,        # learning rate for the training
        hidden_activation="sigmoid", # activation for the hidden nodes: "tanh", "sigmoid" or "relu"
        output_activation="linear",  # activation for the output nodes: "tanh", "sigmoid" or "linear"
        data=data,                   # the loaded and preprocessed data from the csv file
        do_eval=True                 # measure accuracy on the evaluation set?
    )
    model.train()
def __init__(self, data_path):
    self.path = data_path
    self.preprocess = Preprocess()
    self.gender = []
    self.userid = []
    self.model = Model()
    self.model.load_model()
def LoadData():
    print("Preprocess the dataset...", end=' ')
    preprocess = Preprocess()
    SRC, TRG, tr, valid, ts = preprocess.Build()
    print("DONE")
    return SRC, TRG, tr, valid, ts
def closed_form_extra_features(self):
    preprocess1 = Preprocess()
    x_set = preprocess1.matrixify(self.data, 60)
    y_set = Preprocess.get_y(self.data)

    lengths = []
    length_squared = []
    for datapoint in self.data:
        text_length = len(datapoint['text'])
        lengths.append(text_length)

    children_length_inter = []
    children_list = []
    log_children_list = []
    for datapoint in self.data:
        children_list.append(datapoint['children'])
        if datapoint['children'] != 0:
            log_children_list.append(math.log(datapoint['children']))
        else:
            log_children_list.append(0)

    for length, children in zip(lengths, children_list):
        children_length_inter.append(length * children)

    # preprocess1.add_features(children_length_inter)
    # x_set = preprocess1.add_features(log_children_list)
    # x_set = feature_selector.backwardElimination(x_set, y_set, 0.5)
    return self.run_model(x_set, y_set)
def __init__(self):
    self.vect = TfidfVectorizer()
    self.data = None
    self.vect_data = None
    self.pre = Preprocess()
def display_training_and_validation_error(self):
    num_words = 160
    word_nums = np.arange(num_words)
    val_error_list = []
    train_error_list = []
    preprocess1 = Preprocess()
    x_set = preprocess1.matrixify(self.data, num_words)
    y_set = Preprocess.get_y(self.data)
    for x in word_nums:
        cur = x_set[:, 3:3 + x]
        print("Running on top " + str(x) + " words")
        val_error, train_error = self.run_model(cur, y_set)
        val_error_list.append(val_error)
        train_error_list.append(train_error)
    fig, ax = plt.subplots()
    plt.scatter(word_nums, val_error_list, color='blue', s=5, label="Validation set")
    plt.scatter(word_nums, train_error_list, color='red', s=5, label="Training set")
    plt.title("MSE vs number of words used")
    ax.set_xlabel("Words Used")
    ax.set_ylabel("MSE")
    plt.legend(loc='upper right')
    plt.show()
def getConstraints(self, setnumber=""):
    try:
        preprocess = Preprocess()
        absolute_path = path.join(self._path, self._data['params'])
        count = []
        constraints = []
        for filename in listdir(absolute_path):
            match = re.match(self._patterns['params'], filename)
            if match:
                if match.group(2) == setnumber:
                    count.append(match.group(3))
                    with open(path.join(absolute_path, filename), "r") as c:
                        constraints.append(
                            preprocess.preprocessConstraints(c.read().split("\n")))
        if constraints == []:
            return {"error": True, "message": "Something's up"}
        return {"error": False, "constraints": constraints, "count": count}
    except FileNotFoundError:
        return {
            "error": True,
            "message": """Files not found. Please make sure that there is a
            directory called 'params' in the given path, with the files named
            as params.txt or params1.txt or params1-1.txt"""
        }
def elabora(self, path):
    # print path
    img = cv2.imread(path, 0)
    # transformation
    pp = Preprocess()
    img = pp.applyTransform(img, 300, 300)
    descriptorValues = []
    locations = []
    # hd = cv2.HOGDescriptor((32,64), (16,16), (8,8), (8,8), 9)
    # hd = cv2.HOGDescriptor()
    hd = cv2.HOGDescriptor((16, 16), (16, 16), (8, 8), (8, 8), 9)
    # hd = cv2.HOGDescriptor()
    # print "hd length: " + str(len(hd))
    res = hd.compute(img)
    # ls = res.tolist()
    # print str(len(ls))
    """
    des = res[0]
    for i in xrange(1, len(res)):
        des = np.concatenate([des, res[i]])
    """
    return res.ravel()
    # everything below is unreachable after the return
    # print ls
    # print str(len(res))
    # for i in range(0, len(ls)):
    #     des = des + ls[i]
    print "#####################"
def loadData():
    '''
    This function loads the data from various data files and does the basic
    preprocessing. Created to leverage the power of the Streamlit cache.
    '''
    movies_df = Preprocess.loadFile("movies")
    ratings_df = Preprocess.loadFile("ratings")
    final_vector_df = Util.loadObj('final_vector_df')
    embeddings_matrix = final_vector_df.loc[:, final_vector_df.columns != 'movieId']
    embedding_movie_list = final_vector_df['movieId'].tolist()
    ratings_df2 = Preprocess.loadFile("ratings")
    # ratings_input = [ratings_df['userId'].to_numpy(), ratings_df['movieId'].to_numpy(), ratings_df['rating'].to_numpy()]
    users = list(set(ratings_df['userId'].tolist()))
    movies = list(set(ratings_df['movieId'].tolist()))
    users_dict = {u: i for i, u in enumerate(users)}
    movies_dict = {m: i for i, m in enumerate(movies)}      # movie id to idx
    movies_idx_dict = {i: m for i, m in enumerate(movies)}  # idx to movie id
    ratings_df2['userId'] = ratings_df2['userId'].apply(lambda x: users_dict[x])
    ratings_df2['movieId'] = ratings_df2['movieId'].apply(lambda x: movies_dict[x])
    return (movies_df, ratings_df, final_vector_df, embeddings_matrix,
            embedding_movie_list, ratings_df2, users, movies, users_dict,
            movies_dict, movies_idx_dict)
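# The docstring above mentions the Streamlit cache, but no decorator appears in the
# snippet. A minimal sketch of how such a loader is usually cached, assuming the
# legacy st.cache API; the decoration itself is an assumption, not part of the original.
import streamlit as st

@st.cache(allow_output_mutation=True)  # cache the expensive file loads across app reruns
def loadData():
    ...  # body as in the snippet above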
def train():
    from gensim.models import word2vec
    from preprocess import Preprocess, API_download
    import glob

    # pick the texts used for training from train_data
    docs = []
    pathlist = glob.glob("../data/train_data/*")
    for path in pathlist:
        f = open(path)
        text = f.read()
        f.close()
        docs.append(text)

    # prepare the texts
    tagger = API_download.mecab_download()
    word_lists = []
    for doc in docs:
        text = Preprocess.cleaning_text(doc)
        word_class = Preprocess.mecab_list(text, tagger)
        word_list = []
        for word in word_class:
            word_list.append(word[0])
        word_lists.append(word_list)

    # train and build the model
    model = word2vec.Word2Vec(word_lists, size=200, min_count=1, window=5, iter=100)
    return model
def debug(folders, n_components, r=None, max_dimension=1):
    X, y = load_dataset(folders)
    p = Preprocess(n_components)
    X = p.fit_transform(X)
    if r is None:
        distances = PairwiseDistances(X.tolist())
        distances = ExplicitDistances(distances)
        n_samples = len(X)
        r_candidates = sorted(set(np.array(distances.distances).flatten()))
        for r2 in r_candidates:
            print r2
            cx = vietoris_rips(X.tolist(), max_dimension, r2)
            cords = mds_plot(X, y)
            lines_plot(cx, cords)
            plt.show()
    else:
        cx = vietoris_rips(X.tolist(), max_dimension, r)
        actual_max_dimension = len(max(cx, key=len)) - 1
        for d in range(actual_max_dimension, 2, -1):
            sx_d = filter_simplices(cx, d)
            print "dimension", d, ":", len(sx_d), "simplices"
            for i, sx in enumerate(sx_d):
                print i, "..."
                cords = mds_plot(X, y)
                edges = list(combinations(sx, 2))
                lines_plot(edges, cords, color=np.random.rand(3,))
                plt.show()
class Tokenizer():
    def __init__(self, word_level=False, preprocess=True, lang='zh'):
        self.tp = Preprocess(lang=lang)
        self.word_level = word_level
        self.preprocess = preprocess
        self.lang = lang

    def tokenize_str(self, x):
        if self.preprocess:
            if self.word_level:
                x = self.tp.preprocess([x])[0]
            else:
                x = self.tp.clean([x], drop_space=True)[0]
        if self.word_level:
            tokens = x.split(' ')
        else:
            tokens = [t for t in x]
        return tokens

    def __call__(self, X):
        if type(X) is str:
            return self.tokenize_str(X)
        else:
            tokens_list = []
            for x in X:
                tokens_list.append(self.tokenize_str(x))
            return tokens_list
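# Usage sketch for the Tokenizer above. Assumption: the repo's Preprocess class can be
# constructed with lang='zh' as in __init__; passing preprocess=False means its cleaning
# methods are never actually called here.
tok = Tokenizer(word_level=False, preprocess=False)
print(tok("深度学习"))    # character-level tokens: ['深', '度', '学', '习']
print(tok(["ab", "cd"]))  # a list in, a list of token lists out: [['a', 'b'], ['c', 'd']]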
def database():
    from preprocess import Preprocess, API_download
    import glob

    # docs holds the collection of texts, id2doc holds the text names
    docs = []
    id2doc = []
    pathlist = glob.glob("../data/comparison_data/*")
    for path in pathlist:
        f = open(path)
        text = f.read()
        f.close()
        docs.append(text)
        id2doc.append(path)
    print(id2doc)

    # docs is a list of documents
    tagger = API_download.mecab_download()
    word_lists = []
    for doc in docs:
        text = Preprocess.cleaning_text(doc)
        word_class = Preprocess.mecab_list(text, tagger)
        noun_list = Preprocess.noun_extract(word_class)
        noun_list2 = Preprocess.noun_squeeze(noun_list)
        word_lists.append(noun_list2)
    return word_lists, id2doc
def prepare_data(dataset, pca_n):
    global n_classes, X, y, pp, X_tr, X_inv
    n_classes = len(dataset)
    X, y = load_dataset(dataset)
    pp = Preprocess(pca_n)
    X_tr = pp.fit_transform(X)
    X_inv = pp.inverse_transform(X_tr)
def preprocessing(self, method='zagibolov'):
    preprocess = Preprocess(method, self.lexicons, self.negatives, self.stopWords)
    for data in self.corpus:
        preprocess.preprocess(data)
    lexicons = preprocess.lexicons
    self.lexicons = dict(self.lexicons.items() + lexicons.items())
    self.seeds = preprocess.seeds
def main(model_num=1):
    preprocess = Preprocess()
    texts_train, labels_train = preprocess.preprocessData('../projet2/train.txt', mode="train")
    texts_dev, labels_dev = preprocess.preprocessData('../projet2/dev.txt', mode="train")

    MAX_SEQUENCE_LENGTH = 24
    LSTM_DIM = 64
    HIDDEN_LAYER_DIM = 30
    NUM_CLASSES = 4
    GAUSSIAN_NOISE = 0.1
    DROPOUT = 0.2
    DROPOUT_LSTM = 0.2
    BATCH_SIZE = 200

    X_train, X_val, y_train, y_val = train_test_split(texts_train, labels_train,
                                                      test_size=0.2, random_state=42)
    labels_categorical_train = to_categorical(np.asarray(y_train))
    labels_categorical_val = to_categorical(np.asarray(y_val))
    labels_categorical_dev = to_categorical(np.asarray(labels_dev))

    embedding = Embedding('../projet2/emosense.300d.txt')
    embeddings = embedding.getMatrix()
    tokenizer = embedding.getTokenizer()

    message_first_message_train, message_second_message_train, message_third_message_train = get_sequences(
        X_train, MAX_SEQUENCE_LENGTH, tokenizer)
    message_first_message_val, message_second_message_val, message_third_message_val = get_sequences(
        X_val, MAX_SEQUENCE_LENGTH, tokenizer)
    message_first_message_dev, message_second_message_dev, message_third_message_dev = get_sequences(
        texts_dev, MAX_SEQUENCE_LENGTH, tokenizer)

    model = CustomModel(model_num)
    model.build(embeddings, MAX_SEQUENCE_LENGTH, LSTM_DIM, HIDDEN_LAYER_DIM, NUM_CLASSES,
                noise=GAUSSIAN_NOISE, dropout_lstm=DROPOUT_LSTM, dropout=DROPOUT)
    model.summary()
    history = model.train(message_first_message_train, message_second_message_train,
                          message_third_message_train, labels_categorical_train,
                          message_first_message_val, message_second_message_val,
                          message_third_message_val, labels_categorical_val)
    y_pred = model.predict([message_first_message_dev, message_second_message_dev,
                            message_third_message_dev])
def main():
    preprocess = Preprocess()
    preprocess.check_data_distribution()
    print "\n\n*********** ANALYSIS PART I *******************"
    partI_classifier = Classifiers(1)
    partI_classifier.draw_auc_curve(1)
def dataPreprocess(self, path):
    self.preprocess = Preprocess()
    self.preprocess.reader(path)
    # split the dataset
    self.train_data, self.test_data, self.train_labels, self.test_labels = train_test_split(
        self.preprocess.sentences, self.preprocess.labels, test_size=0.3)
    print(self.train_labels[:100])
    self.xgb_train = xgb.DMatrix(np.array(self.train_data), label=np.array(self.train_labels))
    self.xgb_test = xgb.DMatrix(np.array(self.test_data))
def test_remove_non_alpha(self):
    preprocessor = Preprocess()
    preprocessor.preprocess_remove_non_alpha(self.data)
    for point in self.data:
        for word in point['text']:
            try:
                self.assertTrue(word.isalpha())
            except AssertionError:
                print(word)
def test__check_is_list(self):
    df_long = self.spark.read.csv('tests/fixtures/preprocess/long.csv', header=True)
    Preprocess(df_labels=df_long, columns=['country', 'protein'])
    with self.assertRaises(AssertionError):
        Preprocess(df_labels=df_long, columns='protein')
def __init__(self): """initialize dataset and load model""" self.model = load_model(config.model_path) print("[Log] Pretrained model was loaded.") self.preprocess = Preprocess(database_path=config.database_path) print("[Log] Preprocess object was created.") self.database = self.init_database()
def unify_terms(self, _term_list):
    '''
    Unify terms in the term list.
    @param _term_list: [[term]]
    @return: [[term]]
    '''
    _term_list = Preprocess.word_seg([_term_list, ], self._word_seg_config)[0]  # fit the input type of [[term]]
    _term_list = Preprocess.word_lower([_term_list, ])[0]
    return _term_list
def build_feature_matrix(self, file_name):
    # with open('dataset/test.csv', 'rb') as f:
    with open(file_name, 'rb') as f:
        p = Preprocess()
        # "results" contains the preprocessed tweets
        # "word_list" contains all distinct words in the training data
        results = []
        word_list = []
        # dataM contains every tweet's feature vector
        # cataM contains every tweet's category vector
        dataM = []
        cataM = []
        # read training data
        reader = csv.reader(f)
        first = True
        # read stop words from file
        stop_words = p.get_stop_word_list('stopWords.txt')
        for row in reader:
            if first:
                first = False
                continue
            # the 16th column holds the tweet text
            processed_tweet = p.basic_process(row[15])
            # feature_vector = getFeatureVector(processed_tweet)
            feature_vector, word_list = p.get_fea_vector_and_wordlist(processed_tweet,
                                                                      word_list, stop_words)
            # record the feature vector for each tweet
            results.append(feature_vector)
            # record the category for each tweet
            cataM.append(self.get_category(row[5]))
        word_list = sorted(word_list)
        for i in range(len(results)):
            # combine feature vector and category together
            dataM.append(self.data_matrix(results[i], word_list) + cataM[i])
    f.close()
    # write the feature matrix to the file
    with open('featureMatrix.csv', 'wb') as fp:
        writer = csv.writer(fp)
        # write the word list into the '.csv' file
        writer.writerow(word_list)
        # write the feature number matrix into the '.csv' file
        for row in dataM:
            writer.writerow(row)
    fp.close()
def get_txt(self):
    pp = Preprocess()
    # open the original document
    filename = self.files[0].replace('\n', '')
    # print filename
    with open(filename, 'r') as txt:
        txt = txt.read().replace('\n', ' ').replace('\r', ' ')
        txt = pp.prep_text(txt)
    return (filename, txt)
def get_data(test_prob=0.2, on_rnn_set=False, use_twitter=True):
    # FIXME: Just uncomment this stuff out
    if on_rnn_set:
        conn = sqlite3.connect("../data/rnn_data.db")
        data = pd.read_sql("SELECT * FROM RNNData", conn).to_numpy()
        data_without_date = data[:, 1:].astype(np.float32)
        total_points = np.shape(data_without_date)[0]
        train_data = data_without_date[:int((1 - test_prob) * total_points)]
        test_data = data_without_date[int((1 - test_prob) * total_points):]
        train_prices = train_data[1:, 3]
        test_prices = test_data[1:, 3]
        train_data = train_data[:-1]
        test_data = test_data[:-1]
        return train_data, test_data, train_prices, test_prices
    else:
        # Preprocess the actual data that we will use
        STOCK_DATABASE_PATH = "../data/stock_data.db"
        RNN_DATABASE_PATH = "../data/rnn_data.db"
        x = Preprocess(STOCK_DATABASE_PATH, RNN_DATABASE_PATH)
        numpy_data, df_data, numpy_vanilla_rnn_data, df_vanilla_rnn_data = x.get_data()

        X = df_data[["Open", "High", "Low", "Close", "Adj Close", "Volume", "Twitter Score"]]
        # Shift to get the previous data as the next time step's X
        X[["Open", "High", "Low", "Close", "Adj Close", "Volume"]] = \
            X[["Open", "High", "Low", "Close", "Adj Close", "Volume"]].shift(-1)
        y = df_data["Close"]

        # Remove the last step that now has a NaN in the shifted values
        X = X[:-1]
        y = y[:-1]

        # Manually make the test set the last 20 percent
        dataset_size = len(X)
        train_prob = 1 - test_prob
        split_point = int(np.round(dataset_size * train_prob))
        train_data = X.iloc[:split_point, :]
        test_data = X.iloc[split_point:, :]
        train_prices = y[:split_point]
        test_prices = y[split_point:]

        # If we want to run without the Twitter data (BASELINE MODEL)
        if not use_twitter:
            train_data = train_data[["Open", "High", "Low", "Close", "Adj Close", "Volume"]]
            test_data = test_data[["Open", "High", "Low", "Close", "Adj Close", "Volume"]]

        # Convert out of dataframes for use in numpy
        train_data = train_data.to_numpy().astype(np.float32)
        test_data = test_data.to_numpy().astype(np.float32)
        train_prices = train_prices.to_numpy().astype(np.float32)
        test_prices = test_prices.to_numpy().astype(np.float32)
        return train_data, test_data, train_prices, test_prices
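# Standalone illustration (not part of the original pipeline) of the .shift(-1)
# alignment used above: each row's features end up paired with the next row's target,
# and the trailing NaN row is then dropped by the X = X[:-1] step.
import pandas as pd

demo = pd.DataFrame({"Close": [10.0, 11.0, 12.0]})
print(demo["Close"].shift(-1).tolist())  # [11.0, 12.0, nan]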
def test_preprocess(self):
    preprocessor = Preprocess()
    preprocessor.preprocess(self.data)
    for point in self.data:
        self.assertTrue((point['controversiality'] == 0) or (point['controversiality'] == 1))
        self.assertEqual(len(point), 5)
        for word in point['text']:
            try:
                self.assertTrue(not word.isalpha() or word.islower())
            except AssertionError:
                print(word)
def test__check_nulls_in_index_column(self):
    df_nulls = self.spark.read.csv(
        'tests/fixtures/preprocess/nulls_recipe_id.csv', header=True)
    df_no_nulls = self.spark.read.csv(
        'tests/fixtures/preprocess/no_nulls_recipe_id.csv', header=True)
    Preprocess(df_labels=df_no_nulls, columns=[''])
    with self.assertRaises(AssertionError):
        Preprocess(df_labels=df_nulls, columns=[''])
def test__check_is_spark_data_frame(self):
    df_simple_table = self.spark.read.csv(
        'tests/fixtures/preprocess/simple_table.csv', header=True)
    pd_df_simple_table = pd.read_csv(
        'tests/fixtures/preprocess/simple_table.csv')
    Preprocess(df_labels=df_simple_table, columns=[''])
    with self.assertRaises(AssertionError):
        Preprocess(df_labels=pd_df_simple_table, columns=[''])
def search(self, query):
    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    results = []
    for query1 in return_search_results(query.rawQuery):
        query1 = SearchQuery(type=SearchType.SENTENCES,
                             terms=query1.split(" "),
                             rawQuery=query1,
                             k=500)
        result = self.other.search(query1)
        # logging.info(result.searchResultItems)
        results.extend(result.searchResultItems)
    # results = SearchResult(searchResultItems=results, searchQuery=query)
    # logging.info(len(results))
    resultsDict = {}
    for result in results:
        resultsDict[result.sentenceId.uuidString] = result
    results = []
    for key in resultsDict:
        results.append(resultsDict[key])
    # results = results[:10]  # comment out on full run
    comm_ids_list, temp = get_comm_ids(results)
    dictUUID = fetch_dataset(comm_ids_list, temp)
    inv_map = {v: k for k, v in dictUUID.items()}
    toHannah = []
    for uuid in dictUUID:
        toHannah.append([query.rawQuery, dictUUID[uuid]])
    resultItemRet = SearchResult(uuid=aug.next(),
                                 searchQuery=query,
                                 searchResultItems=results,
                                 metadata=AnnotationMetadata(tool="search",
                                                             timestamp=int(time.time())),
                                 lang="eng")
    model = pickle.load(open("./trained_model.p", "rb"))
    pre = Preprocess()
    feature_matrix = pre.process_run(toHannah)
    dictRanks = pre_ranking(feature_matrix, model, toHannah, inv_map)
    results = rerank(dictRanks, resultItemRet)
    resultArr = results.searchResultItems
    resultArr = sorted(resultArr, key=lambda result: result.score, reverse=True)
    for item in resultArr:
        logging.info(item.score)
    resultItemRet = SearchResult(uuid=aug.next(),
                                 searchQuery=query,
                                 searchResultItems=resultArr,
                                 metadata=AnnotationMetadata(tool="search",
                                                             timestamp=int(time.time())),
                                 lang="eng")
    return resultItemRet
class TopicModel(object):

    def dataPreprocess(self, path):
        self.preprocess = Preprocess()
        self.preprocess.reader(path)

    def train(self):
        self.lda = LdaModel(self.preprocess.corpus,
                            id2word=self.preprocess.dictionary,
                            num_topics=10)
        for topic in self.lda.print_topics(num_topics=10, num_words=10):
            print(topic[1])

    def evaluation(self):
        pass
def saveModels():
    '''
    This function opens the tarfile, preprocesses the data, trains models on it
    and saves the models in the Models directory.
    '''
    tar = tarfile.open('Data/babi_tasks_1-20_v1-2.tar.gz')
    challenges = {
        # QA1 with 10,000 samples
        'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
        # QA2 with 10,000 samples
        'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
    }

    ## Single Supporting Fact challenge
    ss_train_stories, ss_test_stories, \
        ss_stories_train, ss_questions_train, ss_answers_train, \
        ss_stories_test, ss_questions_test, ss_answers_test, \
        ss_story_maxlen, ss_story_maxsents, ss_question_maxlen, \
        ss_vocab, ss_vocab_size, ss_word2idx = \
        Preprocess.getData(challenges['single_supporting_fact_10k'], tar)
    ss_idx2word = {value: key for key, value in ss_word2idx.items()}
    single_model, single_debug_model = \
        Models.singleModel(ss_story_maxlen, ss_story_maxsents, ss_question_maxlen, ss_vocab_size,
                           ss_stories_train, ss_questions_train, ss_answers_train,
                           ss_stories_test, ss_questions_test, ss_answers_test,
                           EMBEDDING_DIM, NUM_EPOCHS, BATCH_SIZE)
    Utilities.saveModel(single_model, 'single_model')
    Utilities.saveModel(single_debug_model, 'single_debug_model')

    ## Two Supporting Facts challenge
    ts_train_stories, ts_test_stories, \
        ts_stories_train, ts_questions_train, ts_answers_train, \
        ts_stories_test, ts_questions_test, ts_answers_test, \
        ts_story_maxlen, ts_story_maxsents, ts_question_maxlen, \
        ts_vocab, ts_vocab_size, ts_word2idx = \
        Preprocess.getData(challenges['two_supporting_facts_10k'], tar)
    ts_idx2word = {value: key for key, value in ts_word2idx.items()}
    double_model, double_debug_model = \
        Models.doubleModel(ts_story_maxlen, ts_story_maxsents, ts_question_maxlen, ts_vocab_size,
                           ts_stories_train, ts_questions_train, ts_answers_train,
                           ts_stories_test, ts_questions_test, ts_answers_test,
                           EMBEDDING_DIM, NUM_EPOCHS_2, BATCH_SIZE)
    Utilities.saveModel(double_model, 'double_model')
    Utilities.saveModel(double_debug_model, 'double_debug_model')
def main():
    preprocess = Preprocess(data_file, nrows)
    taxi_summary, L, A, T, p_pick, p_tran, r, t_drive, t_wait = preprocess.preprocess_data()
    print("\n\nFeature Generation Completed .....")
    print("\n\n ---- Top 10 rows ---- \n\n", taxi_summary.head())
    prediction = Prediction()
    prediction.MDP_Dynamic_Program(L, A, T, p_pick, p_tran, r, t_drive, t_wait)
    print("\n\nStarting Revenue Prediction .....")
    prediction.predict_revenue(taxi_summary)
def __init__(self): print("tensorflow version: ", tf.__version__) self.dict_file = 'data/word_dict.txt' self.data_map = "data/map.pkl" # pkl是cpickle模块生成的文件,用于长久保存字符串、列表、字典等数据 self.batch_size = 20 # 每次喂进20个 self.max_epoch = 10000 # 最大100000轮 self.show_batch = 1 # self.model_path = 'model/' # jieba导入词典 jieba.load_userdict(self.dict_file) self.location = ["杭州", "重庆", "上海", "北京"] self.user_info = {"__UserName__": "yw", "__Location__": "重庆"} self.robot_info = {"__RobotName__": "xw"} # 获取输入输出 if os.path.isfile(self.data_map): with open(self.data_map, "rb") as f: data_map = cPickle.load( f) # 使用cpickle读取map.pkl文件内容返回,注意写入是什么类型数据,读取是就是什么类型数据 else: p = Preprocess() p.main() # 如果不存在data_map则调用Preprocess()方法重新创建向量和map data_map = p.data_map # data_map是全局变量的dict,在这里可以取到 # 从data_map中查找各个键值对并赋值,其中也存在字典嵌套 self.encoder_vocab = data_map.get("Q_vocab") self.encoder_vec = data_map.get("Q_vec") self.encoder_vocab_size = data_map.get("Q_vocab_size") self.char_to_vec = self.encoder_vocab self.decoder_vocab = data_map.get("A_vocab") self.decoder_vec = data_map.get("A_vec") self.decoder_vocab_size = data_map.get("A_vocab_size") self.vec_to_char = {v: k for k, v in self.decoder_vocab.items()} print("encoder_vocab_size {}".format(self.encoder_vocab_size)) print("decoder_vocab_size {}".format(self.decoder_vocab_size)) # 调用DynamicSeq2seq()方法,将编码解码词典长度导入,初始化模型 self.model = DynamicSeq2seq( encoder_vocab_size=self.encoder_vocab_size + 1, decoder_vocab_size=self.decoder_vocab_size + 1, ) #优先给程序分配显存 gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333) self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) self.restore_model()
def norm5(cls, data):
    data = [[d] for d in data]
    print "deviation ", Preprocess.standard_deviation(data)
    # data = Preprocess.scale(data)
    # print 'deviation ', Preprocess.standard_deviation(data)
    data = [(d[0] * 10) + 0.5 for d in data]
    return data
def main(args):
    """Main program to run preprocessing of the font.

    Arguments:
      font-file
      --hinting=(False|True), default is False
    """
    options = Options()
    args = options.parse_opts(args, ignore_unknown=True)
    if len(args) < 1:
        print('usage: ./pyprepfnt font-file [--option=value]...', file=sys.stderr)
        sys.exit(1)

    fontfile = args[0]
    args = args[1:]
    filename, extension = os.path.splitext(fontfile)
    cleanfile = filename + '_clean' + extension
    cleanup.cleanup(fontfile, False, cleanfile)
    closure.dump_closure_map(cleanfile, '.')

    preprocess = Preprocess(cleanfile, '.')
    preprocess.base_font()
    preprocess.cmap_dump()
    preprocess.serial_glyphs()
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    preprocesser = Preprocess()
    data = preprocesser.read()

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'Auto Encoder')
    ae = AutoEncoder(
        layers=[
            Layer("Sigmoid", units=100)
        ],
        learning_rate=0.01,
        n_iter=40,
        verbose=True,
    )
    ae.fit(data[:, :-1])

    print '[INFO, time: %s] Transforming Data with %s ...' % (time.strftime('%H:%M:%S'), 'Auto Encoder')
    splitRatio = 0.67
    train, test = splitDataset(data, splitRatio)
    train = np.asarray(train)
    test = np.asarray(test)
    trainX = train[:, :-1]
    trainy = train[:, -1]
    testX = test[:, :-1]
    testy = test[:, -1]
    transformed_trainX = ae.transform(trainX)
    transformed_testX = ae.transform(testX)

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'SVM - rbf kernel (i.e. gaussian) with default parameters')
    clf = SVC()
    clf.fit(transformed_trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(transformed_testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction))
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    preprocesser = Preprocess()
    data = preprocesser.read()

    splitRatio = 0.67
    train, test = splitDataset(data, splitRatio)
    train = np.asarray(train)
    test = np.asarray(test)
    trainX = train[:, :-1]
    trainy = train[:, -1]
    testX = test[:, :-1]
    testy = test[:, -1]

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'SVM - rbf kernel (i.e. gaussian) with default parameters')
    clf = SVC()
    clf.fit(trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction))
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    preprocesser = Preprocess()
    data = preprocesser.read()

    splitRatio = 0.67
    train, test = splitDataset(data, splitRatio)
    train = np.asarray(train)
    test = np.asarray(test)
    trainX = train[:, :-1]
    trainy = train[:, -1]
    testX = test[:, :-1]
    testy = test[:, -1]

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'Gradient Boosting Classifier with 300 estimators')
    clf = GradientBoostingClassifier(n_estimators=300)
    clf.fit(trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction))
def scoring(self, method='zagibolov'):
    # Supply arguments in Corpus to connect to the database: user, password and db.
    corpus = Corpus(password='', db='project_major')
    corpus.getTweets()
    dataset = corpus.dataSet
    preprocess = Preprocess('zagibolov', self.lexicons, self.negatives, self.stopWords)
    scoring = Scoring(method, self.lexicons, self.negatives, self.stopWords, self.seeds)
    j = 0
    for data in dataset:
        preprocess.preprocessScoring(data)
    processed = preprocess.processed_data
    for data in processed:
        scoring.count(data['tweet'])
    ## print self.seeds
    preprocess.seeds = scoring.lexicon_count
    preprocess.processLexicon()
    scoring.lexicons = preprocess.lexicons
    ## print scoring.lexicon_count
    last_score = {}
    i = 0
    for i in range(0, 3):
        total = 0
        j = 0
        negative = 0
        positive = 0
        scoring.resetLexiconCount()
        ## print self.lexicons
        for data in processed:
            if j == 50:
                break
            j += 1
            score = scoring.score(data)
            if score != 0:
                total += 1
                if score < 0:
                    negative += 1
                else:
                    positive += 1
        scoring.adjustScoring()
        if last_score == {}:
            last_score = scoring.lexicons
            this_score = last_score
        else:
            this_score = scoring.lexicons
            if this_score == last_score:
                break
            else:
                last_score = this_score
        print this_score
        print "Total scored: " + str(total), "Negative: ", negative, "Positive: ", positive
    print this_score
    print "Total scored: " + str(total), "Negative: ", negative, "Positive: ", positive
def norm4(cls, data):
    data = [[d] for d in data]
    print "deviation ", Preprocess.standard_deviation(data)
    data = [d[0] for d in data]
    data = Preprocess.root(data, 2)
    data = Preprocess.squeeze(data)
    data = Preprocess.squeeze(data)
    # data = [[d] for d in data]
    # data = Preprocess.scale(data)
    # data = [d[0] for d in data]
    data = [[d] for d in data]
    print "deviation ", Preprocess.standard_deviation(data)
    # data = preprocessing.normalize(data, norm='l2')
    data = Preprocess.norm(data)
    print "deviation ", Preprocess.standard_deviation(data)
    data = [d[0] - 0.04 for d in data]
    data = [round(d, 1) for d in data]
    return data
def testPreprocessWithAllCases(self):
    tweet = "Won't be sleeping for an #overnight with @meyaan. This is going to be a looooooooooooooong night :( #thesis"
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["not sleeping overnight", []],
                ["going loong night", [":("]]]
    self.assertEqual(result, expected)
__author__ = 'Thurston'

from ego import Kriging
from preprocess import Preprocess
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

pre = Preprocess(pca_model='../eco_full_pca.pkl', all_dat='../all_games.pkl')
# pre = Preprocess()
# pre.get_json('../alluser_control.json')  # uncomment this to create the pkl file needed!!
# pre.train_pca()

X, y = pre.ready_player_one(2)
unit_sig = np.ones(31)

# scale = StandardScaler()
scale = MinMaxScaler((-1., 1.))
X = scale.fit_transform(X)

# # # get a sigma estimate that maximizes the sum of expected improvements
import scipy.optimize as opt

all_sigs = np.zeros((len(pre.full_tab['id'].tolist()), 31))
all_improv = np.zeros_like(pre.full_tab['id'].tolist())
lb = 0.01
ub = 100.
bounds = [(lb, ub)] * 31

for n, i in enumerate(pre.full_tab['id'].tolist()):
    a, b = pre.prep_by_id(i)
def testPreprocessNormalTweet(self):
    tweet = "Thinking trying social media management tool? Test drive Sprout Social free today!"
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["thinking trying social media management tool", []],
                ["test drive sprout social free today", []]]
    self.assertEqual(result, expected)
def rmse(preds, targets):
    # NOTE: only the tail of this function survived in the original snippet; the
    # signature and the "dif" line are reconstructed here as an assumption
    # (squared differences, consistent with taking a square root of their mean).
    dif = [(p - t) ** 2 for p, t in zip(preds, targets)]
    mean = float(sum(dif)) / len(targets)
    root = math.sqrt(mean)
    return root

""" Average of a list """
def avg(array):
    return float(sum(array)) / len(array)

boston = datasets.load_boston()
# boston = datasets.make_regression()
# data = boston[0]
# target = boston[1]
data = boston.data
target = boston.target
matrix = Preprocess.to_matrix(list(data))
matrix = Preprocess.scale(matrix)
matrix = list(matrix)
target = list(target)

layers = [13, 7, 1]
dnn = DNN(matrix, target, layers, hidden_layer="TanhLayer", final_layer="LinearLayer",
          compression_epochs=5, smoothing_epochs=0, bias=True)
full = dnn.fit()
print full
# preds = [dnn.predict(d)[0] for d in matrix]
preds = [full.activate(d)[0] for d in matrix]
print "mrse preds {0}".format(mrse(preds, target))
print "rmse preds {0}".format(rmse(preds, target))
# mean = avg(target)
def testPreprocessTweetWithMidHashtag(self):
    tweet = "New always-on #AndroidWear apps keep info handy for when you are on the go"
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["new always-on androidwear apps keep info handy when go", []]]
    self.assertEqual(result, expected)
def testPreprocessTweetWithEndHashtag(self):
    tweet = "THANKS FOR ALL THE QUESTIONS DURING THIS GAB! KEEP VOTING USING #ChoiceSciFiTVActress"
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["thanks questions during gab", []],
                ["keep voting using", []]]
    self.assertEqual(result, expected)
import os.path
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import json

from ego import Kriging
from preprocess import Preprocess
from reg import CovarianceEstimate
import numpy as np

# # # # get data from the game
# # delete the parameters if performing first-time or new player.
# # Parameters are there to speed up after saving a pkl.
pre = Preprocess(pca_model='../eco_full_pca.pkl', all_dat='../all_games.pkl')
# pre = Preprocess()
# pre.get_json('../alluser_control.json')  # uncomment this to create the pkl file needed!!
# pre.train_pca()
X, y = pre.ready_bad_player()

from sklearn.preprocessing import StandardScaler, MinMaxScaler
# scale = StandardScaler()
scale = MinMaxScaler((-1., 1.))
X = scale.fit_transform(X)

########
# X, y = X[:12], y[:12]
########
import os.path
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import json

from ego import Kriging
from preprocess import Preprocess
from reg import CovarianceEstimate
import numpy as np

# # # # get data from the game
# # delete the parameters if performing first-time or new player.
# # Parameters are there to speed up after saving a pkl.
pre = Preprocess(pca_model='../eco_full_pca.pkl', all_dat='../all_games.pkl')
# pre = Preprocess()
# pre.get_json('../alluser_control.json')  # uncomment this to create the pkl file needed!!
# pre.train_pca()
X, y = pre.ready_player_one(2)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
# scale = StandardScaler()
scale = MinMaxScaler((-1., 1.))
X = scale.fit_transform(X)

########
n_trajectory = 12
X, y = X[:n_trajectory], y[:n_trajectory]  # only use the first few plays
########
from image import Image
from preprocess import Preprocess
from classifier import Classifier
from log_loss import log_loss
from postprocess import PostProcess

genders = Image.genders()
d, _ = Image.data()
matrix = Preprocess.to_matrix(d)
print matrix.shape
matrix = Preprocess.remove_constants(matrix)
print matrix.shape
matrix = Preprocess.scale(matrix)
matrix = Preprocess.polynomial(matrix, 2)
matrix = Preprocess.scale(matrix)
print matrix.shape
matrix = matrix.tolist()

half = len(matrix) / 2
train, cv = matrix[:half], matrix[half:]
train_genders, cv_genders = genders[:half], genders[half:]
cv_genders = cv_genders[0::4]
preds = Classifier.ensemble_preds(train, train_genders, cv)
print "Score: ", log_loss(preds, cv_genders)
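# For reference, sklearn's PolynomialFeatures performs the kind of degree-2 expansion
# that Preprocess.polynomial(matrix, 2) appears to do above; that equivalence is an
# assumption, since the repo's own implementation is not shown here.
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

demo = np.array([[1.0, 2.0]])
print(PolynomialFeatures(degree=2).fit_transform(demo))
# [[1. 1. 2. 1. 2. 4.]] -> bias, x1, x2, x1^2, x1*x2, x2^2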
def testPreprocessWithMentions(self):
    tweet = "Take a trip to Central City with @MiloVentimiglia and The PET Squad"
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["take trip central city pet squad", []]]
    self.assertEqual(result, expected)
def testPreprocessWithContractions(self):
    tweet = "this isn't real! :("
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["not real", []],
                ["", [":("]]]
    self.assertEqual(result, expected)
""" Example run, which processes an input at 60s rolling intervals, caclulates mean degree for all intervals, and outputs degrees to a file. Call in terminal from root like: $ python src/main.py /path/to/input.txt /path/to/output.txt Equivalent to: $ ./run.sh if the input is /tweet_input/tweets.txt and the output is /tweet_output/output.txt """ # def main(): if __name__ == '__main__': import sys print(sys.argv) pre = Preprocess(sys.argv[1]) pre.extract() graph_gen = rolled_graph_gen(pre.df) degrees = g_stats(graph_gen, mean_deg, savename=sys.argv[2])
import json

from ego import Kriging
from preprocess import Preprocess
from reg import CovarianceEstimate
import numpy as np
from scipy.spatial.distance import pdist, cdist, squareform
from pyDOE import lhs
from scipy.misc import logsumexp
import scikits.bootstrap as boot

# # # # get data from the game
# # delete the parameters if performing first-time or new player.
# # Parameters are there to speed up after saving a pkl.
pre = Preprocess(pca_model='../eco_full_pca.pkl', all_dat='../all_games.pkl')
# pre = Preprocess()
# pre.get_json('../alluser_control.json')  # uncomment this to create the pkl file needed!!
# pre.train_pca()
X, y = pre.ready_player_one(2)
# MAX: first dimension is the number of plays, second is the solution-space dimension

from sklearn.preprocessing import StandardScaler, MinMaxScaler
# scale = StandardScaler()
scale = MinMaxScaler((-1., 1.))
X = scale.fit_transform(X)

total_no_iters = 31
n_trial = 1
dim = 31
bounds = np.array([[-1., 1.]] * 31)
from preprocess import Preprocess
from featureExtract import FeatureExtract
from afinnClassifier import Afinn
from emoticons import Emoticons

global prep
iter = 1
print("Welcome!!!\n")
while iter == 1:
    tweet = raw_input("Please enter a tweet to be analyzed: ")
    prep = Preprocess(tweet)  # load the Preprocess class

    # preprocess the input data
    data = prep.preprocess()
    print data

    # generate bigrams from the preprocessed text
    for item in data:
        if not item[0]:
            item[0] = None
        else:
            bigrams = FeatureExtract(item[0]).getBigrams
            item[0] = bigrams
    print data

    A = Afinn()
    A.classify(data)
dstdir = './'
infiles = [
    srcdir + 'WikiQA-mz-train.txt',
    srcdir + 'WikiQA-mz-dev.txt',
    srcdir + 'WikiQA-mz-test.txt'
]

corpus, rel_train, rel_valid, rel_test = prepare.run_with_train_valid_test_corpus(
    infiles[0], infiles[1], infiles[2])
print('total corpus : %d ...' % (len(corpus)))
print('total relation-train : %d ...' % (len(rel_train)))
print('total relation-valid : %d ...' % (len(rel_valid)))
print('total relation-test : %d ...' % (len(rel_test)))

prepare.save_corpus(dstdir + 'corpus.txt', corpus)
prepare.save_relation(dstdir + 'relation_train.txt', rel_train)
prepare.save_relation(dstdir + 'relation_valid.txt', rel_valid)
prepare.save_relation(dstdir + 'relation_test.txt', rel_test)
print('Preparation finished ...')

preprocessor = Preprocess(word_stem_config={'enable': False},
                          word_filter_config={'min_freq': 2})
dids, docs = preprocessor.run(dstdir + 'corpus.txt')
preprocessor.save_word_dict(dstdir + 'word_dict.txt', True)
preprocessor.save_words_stats(dstdir + 'word_stats.txt', True)

fout = open(dstdir + 'corpus_preprocessed.txt', 'w')
for inum, did in enumerate(dids):
    fout.write('%s %s %s\n' % (did, len(docs[inum]), ' '.join(map(str, docs[inum]))))
fout.close()
print('Preprocess finished ...')

# dssm_corp_input = dstdir + 'corpus_preprocessed.txt'
# dssm_corp_output = dstdir + 'corpus_preprocessed_dssm.txt'
word_dict_input = dstdir + 'word_dict.txt'
triletter_dict_output = dstdir + 'triletter_dict.txt'
word_triletter_output = dstdir + 'word_triletter_map.txt'
from preprocess import Preprocess

p = Preprocess()
p.preprocessDirectory()