def main():
    # setup logging --------------------------
    logging.basicConfig(filename='plsa.log',
                        level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                        datefmt='%a, %d %b %Y %H:%M:%S')
    #console = logging.StreamHandler()
    #console.setLevel(logging.INFO)
    #logging.getLogger('').addHandler(console)
    # some basic configuration ---------------
    fname = './data.txt'
    fsw = './stopwords.txt'
    eps = 20.0
    key_word_size = 10
    # preprocess -----------------------------
    pp = PP(fname, fsw)
    w_d = pp.get_w_d()
    V, D = w_d.shape
    logging.info('V = %d, D = %d' % (V, D))
    # train model and get result -------------
    pmodel = PLSA()
    for z in range(3, (D+1), 10):
        t1 = time.clock()
        (l, p_d_z, p_w_z, p_z) = pmodel.train(w_d, z, eps)
        t2 = time.clock()
        logging.info('z = %d, likelihood = %f, time = %f' % (z, l, t2-t1))
        for itz in range(z):
            logging.info('Topic %d' % itz)
            data = [(p_w_z[i][itz], i) for i in range(len(p_w_z[:,itz]))]
            data.sort(key=lambda tup:tup[0], reverse=True)
            for i in range(key_word_size):
                logging.info('%s : %.6f ' % (pp.get_word(data[i][1]), data[i][0]))
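
# The PLSA class itself is not shown in these examples. As a rough reference for
# what pmodel.train(w_d, z, eps) iterates, here is a minimal EM sketch for PLSA
# over a V x D count matrix (shapes and names assumed from the usage above; an
# illustrative implementation, not the project's own):
import numpy as np

def plsa_em_sketch(w_d, Z, n_iter=50, seed=0):
    rng = np.random.default_rng(seed)
    V, D = w_d.shape
    p_z = np.full(Z, 1.0 / Z)                                 # P(z)
    p_w_z = rng.random((V, Z)); p_w_z /= p_w_z.sum(axis=0)    # P(w|z)
    p_d_z = rng.random((D, Z)); p_d_z /= p_d_z.sum(axis=0)    # P(d|z)
    for _ in range(n_iter):
        # E-step: responsibilities P(z|w,d), shape V x D x Z
        post = p_z[None, None, :] * p_w_z[:, None, :] * p_d_z[None, :, :]
        post /= post.sum(axis=2, keepdims=True) + 1e-12
        # M-step: re-estimate the factors from expected counts
        nz = w_d[:, :, None] * post
        p_w_z = nz.sum(axis=1); p_w_z /= p_w_z.sum(axis=0, keepdims=True)
        p_d_z = nz.sum(axis=0); p_d_z /= p_d_z.sum(axis=0, keepdims=True)
        p_z = nz.sum(axis=(0, 1)); p_z /= p_z.sum()
    return p_d_z, p_w_z, p_z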
	def testPreprocessWithElongatedWords(self):
		tweet = "dili kaayu klaro imuha :( HAHAHAHA haaaaaaaaaaaaaaays! dapat ipa zoom ang nawong pa more HAHAHAHA :P"
		prep = Preprocess(tweet)
		result = prep.preprocess()
		expected = [["dili kaayo klaro imuha", [":("]], ["hahahaha haays", []], ["dapat ipa zoom ang nawong pa more "
		                                                                       "hahahaha", [":P"]]]
		self.assertEqual(result, expected)
    def __init__(self):
        cursor.execute("SELECT content FROM data")
        scripts = cursor.fetchall()

        fw = open('vector.txt', 'w')
        fresult = open('result.txt', 'w')
        mPreprocess = Preprocess()
        mPairedToken = PairToken()
        mConvertVector = ConvertVector()
        stanford = StanfordCoreNLP('http://localhost:9000')
        for script in scripts:
            # if type(script) is tuple:
            listToken = mPreprocess.exec(script[0])
            # else:
            #     listToken = mPreprocess.exec(script)

            listCouple = mPairedToken.exec(listToken)

            output = stanford.annotate(script[0], properties={'annotators': 'coref', 'outputFormat': 'json'})

            for mCoupleToken in listCouple:
                if self.checkCoreF(output['corefs'], mCoupleToken):
                    # fresult.write(str(1) + ' ' + mCoupleToken.np1.text + '  ' + mCoupleToken.np2.text)
                    fresult.write(str(1))
                    fresult.write('\n')
                else:
                    # fresult.write(str(-1) + ' ' + mCoupleToken.np1.text + '  ' + mCoupleToken.np2.text)
                    fresult.write(str(-1))
                    fresult.write('\n')

                vector = mConvertVector.exec(mCoupleToken)

                fw.write(str(vector))
                fw.write('\n')
Example #4
def main():
    # setup logging --------------------------
    logging.basicConfig(filename='plsa.log',
                        level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                        datefmt='%a, %d %b %Y %H:%M:%S')
    #console = logging.StreamHandler()
    #console.setLevel(logging.INFO)
    #logging.getLogger('').addHandler(console)
    # some basic configuration ---------------
    fname = './data.txt'
    fsw = './stopwords.txt'
    eps = 20.0
    key_word_size = 10
    # preprocess -----------------------------
    pp = PP(fname, fsw)
    w_d = pp.get_w_d()
    V, D = w_d.shape
    logging.info('V = %d, D = %d' % (V, D))
    # train model and get result -------------
    pmodel = PLSA()
    for z in range(3, (D+1), 10):
        t1 = time.clock()
        (l, p_d_z, p_w_z, p_z) = pmodel.train(w_d, z, eps)
        t2 = time.clock()
        #logging.info('z = %d, eps = %f, time = %f' % (z, l, t2-t1))
        #print ('z = %d, eps = %f, time = %f' % (z, l, t2-t1))
        for itz in range(z):
            logging.info('Topic %d' % itz)
            data = [(p_w_z[i][itz], i) for i in range(len(p_w_z[:,itz]))]
            data.sort(key=lambda tup:tup[0], reverse=True)
            for i in range(key_word_size):
                logging.info('%s : %.6f ' % (pp.get_word(data[i][1]), data[i][0]))
                print ('%s : %.6f ' % (pp.get_word(data[i][1]), data[i][0]))
Example #5
def main():
    '''Training of the model on the preprocessed data. '''

    preprocess = Preprocess()
    data = preprocess.getData(
        path="creditcard.csv",  # path of the csv file
        feature_incides=[0, 29],  # column indices of the features
        label_indices=[30],  # column indices of the labels
        training_size=0.5,  # size for the training set 
        standardize=True,  # apply standardization?
        eval_set=True  # create evaluation set?
    )

    model = Model(
        batch_size=10,  # size of the training batch  
        epochs=50,  # number of training epochs  
        nodes=[
            29, 200, 2
        ],  # List of neurons, first entry is the number of input, last entry 
        # the number of output neurons. The values in between are the hidden neurons.
        learning_rate=0.0001,  # learning rate for the training
        hidden_activation=
        "sigmoid",  # activation function for the hidden nodes, choose between "tanh", "sigmoid" and "relu"
        output_activation=
        "linear",  # activation function for the output nodes, choose between "tanh", "sigmoid" and "linear" 
        data=data,  # the loaded and preprocessed data form the csv file
        do_eval=True  # measure accuracy of the evaluation set?
    )

    model.train()
Example #6
 def __init__(self, data_path):
     self.path = data_path
     self.preprocess = Preprocess()
     self.gender = []
     self.userid = []
     self.model = Model()
     self.model.load_model()
Example #7
def LoadData():
	print("Preprocess the dataset...", end = ' ')
	preprocess = Preprocess()
	SRC, TRG, tr, valid, ts = preprocess.Build()
	print("DONE")
	
	return SRC,TRG, tr, valid, ts
    def closed_form_extra_features(self):
        preprocess1 = Preprocess()

        x_set = preprocess1.matrixify(self.data, 60)
        y_set = Preprocess.get_y(self.data)
        lengths = []
        length_squared = []

        for datapoint in self.data:
            text_length = len(datapoint['text'])
            lengths.append(text_length)

        children_length_inter = []
        children_list = []
        log_children_list = []

        for datapoint in self.data:
            children_list.append(datapoint['children'])
            if datapoint['children'] != 0:
                log_children_list.append(math.log(datapoint['children']))
            else:
                log_children_list.append(0)

        for length, children in zip(lengths, children_list):
            children_length_inter.append(length * children)

        # preprocess1.add_features(children_length_inter)
        # x_set = preprocess1.add_features(log_children_list)
        # x_set = feature_selector.backwardElimination(x_set, y_set, 0.5)
        return self.run_model(x_set, y_set)
Example #9
    def __init__(self):
        self.vect = TfidfVectorizer()

        self.data = None
        self.vect_data = None

        self.pre = Preprocess()
Example #10
 def display_training_and_validation_error(self):
     num_words = 160
     word_nums = np.arange(num_words)
     val_error_list = []
     train_error_list = []
     preprocess1 = Preprocess()
     x_set = preprocess1.matrixify(self.data, num_words)
     y_set = Preprocess.get_y(self.data)
     for x in word_nums:
         cur = x_set[:, 3:3 + x]
         print("Running on top " + str(x) + " words")
         val_error, train_error = self.run_model(cur, y_set)
         val_error_list.append(val_error)
         train_error_list.append(train_error)
     fig, ax = plt.subplots()
     plt.scatter(word_nums,
                 val_error_list,
                 color='blue',
                 s=5,
                 label="Validation set")
     plt.scatter(word_nums,
                 train_error_list,
                 color='red',
                 s=5,
                 label="Training set")
     plt.title("MSE vs number of words used")
     ax.set_xlabel("Words Used")
     ax.set_ylabel("MSE")
     plt.legend(loc='upper right')
     plt.show()
Example #11
    def getConstraints(self, setnumber=""):
        try:
            preprocess = Preprocess()
            absolute_path = path.join(self._path, self._data['params'])
            count = []
            constraints = []
            for filename in listdir(absolute_path):
                match = re.match(self._patterns['params'], filename)
                if match:
                    if (match.group(2) == setnumber):
                        count.append(match.group(3))
                        with open(path.join(absolute_path, filename),
                                  "r") as c:
                            constraints.append(
                                preprocess.preprocessConstraints(
                                    c.read().split("\n")))
            if constraints == []:
                return {"error": True, "message": "Something's up"}
            return {"error": False, "constraints": constraints, "count": count}
        except FileNotFoundError:
            return {
                "error":
                True,
                "message":
                """Files not found. Please make sure that there is a directory called 'params' 
				in the given path, with the files named as params.txt or params1.txt or params1-1.txt"""
            }
Example #12
	def elabora(self,path):
		#print path
		img = cv2.imread(path,0)
		
		# Transformation
		pp = Preprocess()
		img = pp.applyTransform(img,300,300)

		descriptorValues = []
		locations = []
		#hd = cv2.HOGDescriptor((32,64), (16,16), (8,8), (8,8), 9)
		#hd = cv2.HOGDescriptor()
		hd = cv2.HOGDescriptor((16,16), (16,16), (8,8), (8,8), 9)  # winSize, blockSize, blockStride, cellSize, nbins
		#hd = cv2.HOGDescriptor()
		#print "Lunghezza di hd: "+str(len(hd))
		res = hd.compute(img)		
		#ls = res.tolist()
		#print str(len(ls))
		"""
		des = res[0]

		for i in xrange(1,len(res)):
			des = np.concatenate([des,res[i]])
		"""
		return res.ravel()
		#print ls
		#print str(len(res))
		#for i in range(0,len(ls)):
		#	des = des+ls[i]
		print "#####################"	
def loadData():
    '''
    This function loads the data from various data files and does the basic preprocessing.
    Created to leverage the power of streamlit cache.
    '''
    movies_df = Preprocess.loadFile("movies")
    ratings_df = Preprocess.loadFile("ratings")
    final_vector_df = Util.loadObj('final_vector_df')
    embeddings_matrix = final_vector_df.loc[:, final_vector_df.
                                            columns != 'movieId']
    embedding_movie_list = final_vector_df['movieId'].tolist()

    ratings_df2 = Preprocess.loadFile("ratings")
    # ratings_input =  [ratings_df['userId'].to_numpy(), ratings_df['movieId'].to_numpy(), ratings_df['rating'].to_numpy()]
    users = list(set(ratings_df['userId'].tolist()))
    movies = list(set(ratings_df['movieId'].tolist()))

    users_dict = {u: i for i, u in enumerate(users)}
    movies_dict = {m: i for i, m in enumerate(movies)}  # Movie Id to Idx
    movies_idx_dict = {i: m for i, m in enumerate(movies)}  #Idx to movie Id

    ratings_df2['userId'] = ratings_df2['userId'].apply(
        lambda x: users_dict[x])
    ratings_df2['movieId'] = ratings_df2['movieId'].apply(
        lambda x: movies_dict[x])

    return movies_df, ratings_df, final_vector_df, embeddings_matrix, embedding_movie_list, ratings_df2, users, movies, users_dict, movies_dict, movies_idx_dict
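
# loadData's docstring says it exists "to leverage the power of streamlit cache";
# the usual way to do that is to wrap the loader in Streamlit's caching decorator.
# Minimal sketch, assuming a pandas CSV loader and a recent Streamlit release
# (older releases use @st.cache instead of @st.cache_data); path and helper name
# are hypothetical:
import pandas as pd
import streamlit as st

@st.cache_data
def load_movies(path="movies.csv"):
    # runs once per unique argument set; later app reruns reuse the cached DataFrame
    return pd.read_csv(path)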
Example #14
    def train():
        from gensim.models import word2vec
        from preprocess import Preprocess, API_download
        import glob

        # select the texts used for training from train_data
        docs = []
        pathlist = glob.glob("../data/train_data/*")
        for path in pathlist:
            f = open(path)
            text = f.read()
            f.close()
            docs.append(text)

        # prepare the texts
        tagger = API_download.mecab_download()
        word_lists = []
        for doc in docs:
            text = Preprocess.cleaning_text(doc)
            word_class = Preprocess.mecab_list(text, tagger)
            word_list = []
            for word in word_class:
                word_list.append(word[0])
            word_lists.append(word_list)
        # train and build the word2vec model
        model = word2vec.Word2Vec(word_lists,
                                  size=200,
                                  min_count=1,
                                  window=5,
                                  iter=100)
        return model
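
# Hedged follow-up: once trained, the model is typically queried through its
# keyed vectors, e.g. the nearest neighbours of a token that occurred in
# word_lists. Helper name and placeholder token are illustrative only:
def nearest_tokens(model, token, topn=5):
    # model is the Word2Vec instance returned by train(); token must be in the vocabulary
    return model.wv.most_similar(token, topn=topn)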
def debug(folders, n_components, r = None, max_dimension = 1):
    X,y = load_dataset(folders)
    p = Preprocess(n_components)
    X = p.fit_transform(X)
    
    if r is None:
        distances = PairwiseDistances(X.tolist())
        distances = ExplicitDistances(distances)
        n_samples = len(X)
        r_candidates = sorted(set(np.array(distances.distances).flatten()))
        for r2 in r_candidates:
            print r2
            cx = vietoris_rips(X.tolist(), max_dimension, r2)
            cords = mds_plot(X, y)
            lines_plot(cx, cords)
            plt.show()
    else:
        cx = vietoris_rips(X.tolist(), max_dimension, r)
        actual_max_dimension = len(max(cx, key=len)) - 1
        for d in range(actual_max_dimension, 2, -1):
            sx_d = filter_simplices(cx, d)
            print "dimension", d, ":", len(sx_d), "simplices"
            for i, sx in enumerate(sx_d):
                print i, "..."
                cords = mds_plot(X, y)
                edges = list(combinations(sx, 2))
                lines_plot(edges, cords, color=np.random.rand(3,))
                plt.show()
Example #16
class Tokenizer():
    def __init__(self, word_level=False, preprocess=True, lang='zh'):
        self.tp = Preprocess(lang=lang)
        self.word_level = word_level
        self.preprocess = preprocess
        self.lang = lang

    def tokenize_str(self, x):
        if self.preprocess:
            if self.word_level:
                x = self.tp.preprocess([x])[0]
            else:
                x = self.tp.clean([x],drop_space=True)[0]
        if self.word_level:
            tokens =  x.split(' ')
        else:
            tokens = [t for t in x]
        return tokens
            
    def __call__(self, X):
        if type(X) is str:
            return self.tokenize_str(X)
        else:
            tokens_list = []
            for x in X:
                tokens_list.append(self.tokenize_str(x))
            return tokens_list
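
# Usage sketch of the Tokenizer above (illustrative inputs; with preprocess=False
# the Preprocess cleaning step is skipped, although Preprocess(lang=...) is still
# instantiated in __init__):
char_tok = Tokenizer(word_level=False, preprocess=False)
print(char_tok("abc"))             # ['a', 'b', 'c']           (character-level)
word_tok = Tokenizer(word_level=True, preprocess=False)
print(word_tok(["a b", "c d"]))    # [['a', 'b'], ['c', 'd']]  (whitespace split)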
Example #17
    def database():
        from preprocess import Preprocess, API_download
        import glob

        # docs holds the collection of texts; id2doc holds their file names
        docs = []
        id2doc = []
        pathlist = glob.glob("../data/comparison_data/*")

        for path in pathlist:
            f = open(path)
            text = f.read()
            f.close()
            docs.append(text)
            id2doc.append(path)
        print(id2doc)

        # docs contains the list of documents
        tagger = API_download.mecab_download()
        word_lists = []
        for doc in docs:
            text = Preprocess.cleaning_text(doc)
            word_class = Preprocess.mecab_list(text, tagger)
            noun_list = Preprocess.noun_extract(word_class)
            noun_list2 = Preprocess.noun_squeeze(noun_list)
            word_lists.append(noun_list2)
        return word_lists, id2doc
def prepare_data(dataset, pca_n):
    global n_classes, X, y, pp, X_tr, X_inv
    n_classes = len(dataset)
    X, y = load_dataset(dataset)
    pp = Preprocess(pca_n)
    X_tr = pp.fit_transform(X)
    X_inv = pp.inverse_transform(X_tr)
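
# The Preprocess used here exposes an sklearn-style fit_transform /
# inverse_transform pair parameterised by pca_n. A minimal stand-in sketch,
# assuming it wraps PCA as the argument name suggests (hypothetical class,
# not the project's own implementation):
from sklearn.decomposition import PCA

class PCAPreprocessSketch:
    def __init__(self, n_components):
        self.pca = PCA(n_components=n_components)

    def fit_transform(self, X):
        return self.pca.fit_transform(X)

    def inverse_transform(self, X_tr):
        # maps the reduced representation back into the original feature space
        return self.pca.inverse_transform(X_tr)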
Example #19
 def preprocessing(self, method='zagibolov'):
     preprocess = Preprocess(method, self.lexicons, self.negatives, self.stopWords)
     for data in self.corpus:
         preprocess.preprocess(data)
         lexicons = preprocess.lexicons
         self.lexicons = dict(self.lexicons.items() + lexicons.items())
     self.seeds = preprocess.seeds
Example #20
def main(model_num=1):

    preprocess = Preprocess()

    texts_train, labels_train = preprocess.preprocessData(
        '../projet2/train.txt', mode="train")
    texts_dev, labels_dev = preprocess.preprocessData('../projet2/dev.txt',
                                                      mode="train")

    MAX_SEQUENCE_LENGTH = 24
    LSTM_DIM = 64
    HIDDEN_LAYER_DIM = 30
    NUM_CLASSES = 4
    GAUSSIAN_NOISE = 0.1
    DROPOUT = 0.2
    DROPOUT_LSTM = 0.2
    BATCH_SIZE = 200

    X_train, X_val, y_train, y_val = train_test_split(texts_train,
                                                      labels_train,
                                                      test_size=0.2,
                                                      random_state=42)

    labels_categorical_train = to_categorical(np.asarray(y_train))
    labels_categorical_val = to_categorical(np.asarray(y_val))
    labels_categorical_dev = to_categorical(np.asarray(labels_dev))

    embedding = Embedding('../projet2/emosense.300d.txt')
    embeddings = embedding.getMatrix()
    tokenizer = embedding.getTokenizer()

    message_first_message_train, message_second_message_train, message_third_message_train = get_sequences(
        X_train, MAX_SEQUENCE_LENGTH, tokenizer)
    message_first_message_val, message_second_message_val, message_third_message_val = get_sequences(
        X_val, MAX_SEQUENCE_LENGTH, tokenizer)
    message_first_message_dev, message_second_message_dev, message_third_message_dev = get_sequences(
        texts_dev, MAX_SEQUENCE_LENGTH, tokenizer)

    model = CustomModel(model_num)
    model.build(embeddings,
                MAX_SEQUENCE_LENGTH,
                LSTM_DIM,
                HIDDEN_LAYER_DIM,
                NUM_CLASSES,
                noise=GAUSSIAN_NOISE,
                dropout_lstm=DROPOUT_LSTM,
                dropout=DROPOUT)
    model.summary()
    history = model.train(message_first_message_train,
                          message_second_message_train,
                          message_third_message_train,
                          labels_categorical_train, message_first_message_val,
                          message_second_message_val,
                          message_third_message_val, labels_categorical_val)

    y_pred = model.predict([
        message_first_message_dev, message_second_message_dev,
        message_third_message_dev
    ])
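
    # y_pred holds per-class probabilities for the dev set; a common follow-up
    # (not in the original) is to collapse predictions and the one-hot dev labels
    # to class indices and score them. Hedged sketch:
    from sklearn.metrics import accuracy_score, f1_score
    pred_labels = y_pred.argmax(axis=1)
    true_labels = labels_categorical_dev.argmax(axis=1)
    print("accuracy:", accuracy_score(true_labels, pred_labels))
    print("macro F1:", f1_score(true_labels, pred_labels, average="macro"))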
Example #21
def main():

    preprocess = Preprocess()
    preprocess.check_data_distribution()

    print "\n\n*********** ANALYSIS PART I *******************"
    partI_classifier = Classifiers(1)
    partI_classifier.draw_auc_curve(1)
Example #22
 def dataPreprocess(self,path):
     self.preprocess=Preprocess()
     self.preprocess.reader(path)
     # split into training and test sets
     self.train_data,self.test_data,self.train_labels,self.test_labels=train_test_split(self.preprocess.sentences,self.preprocess.labels
                                                                 ,test_size=0.3)
     print(self.train_labels[:100])
     self.xgb_train=xgb.DMatrix(np.array(self.train_data),label=np.array(self.train_labels))
     self.xgb_test=xgb.DMatrix(np.array(self.test_data))
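
 # Hedged follow-up to dataPreprocess (hypothetical method; the objective and
 # hyper-parameters below are assumptions, the project's own training call is
 # not shown here):
 def trainXgb(self):
     params = {"objective": "multi:softprob",
               "num_class": len(set(self.train_labels)),
               "max_depth": 6, "eta": 0.1}
     booster = xgb.train(params, self.xgb_train, num_boost_round=100)
     probs = booster.predict(self.xgb_test)   # per-class probabilities per sentence
     return probs.argmax(axis=1)              # hard label predictions for the test split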
 def test_remove_non_alpha(self):
     preprocessor = Preprocess()
     preprocessor.preprocess_remove_non_alpha(self.data)
     for point in self.data:
         for word in point['text']:
             try:
                 self.assertTrue(word.isalpha())
             except AssertionError:
                 print(word)
Example #24
    def test__check_is_list(self):

        df_long = self.spark.read.csv('tests/fixtures/preprocess/long.csv',
                                      header=True)

        Preprocess(df_labels=df_long, columns=['country', 'protein'])

        with self.assertRaises(AssertionError):
            Preprocess(df_labels=df_long, columns='protein')
    def __init__(self):
        """initialize dataset and load model"""
        self.model = load_model(config.model_path)
        print("[Log] Pretrained model was loaded.")

        self.preprocess = Preprocess(database_path=config.database_path)
        print("[Log] Preprocess object was created.")

        self.database = self.init_database()
Example #26
 def unify_terms(self,_term_list):
     '''
     unify terms in the term list
     @param _term_list: [[term]]
     @return: [[term]]
     '''
     _term_list = Preprocess.word_seg([_term_list,],self._word_seg_config)[0] # fit in input type of [[term]]
     _term_list = Preprocess.word_lower([_term_list,])[0]
     return _term_list
    def build_feature_matrix(self, file_name):
        # with open('dataset/test.csv', 'rb') as f:
        with open(file_name, 'rb') as f:
            p = Preprocess()

            # "results" contains preprocessed tweet
            # word list contains all distinct words in training data
            results = []; word_list = []

            # dataM contains every tweet's feature vector
            # cataM contains every tweet's catagory vector
            dataM = []; cataM = []

            # read training data
            reader = csv.reader(f)
            first = True
            
            # read stop words from file
            stop_words = p.get_stop_word_list('stopWords.txt')

            for row in reader:
                if first:
                    first = False
                    continue

                # the 16th column is about tweet
                processed_tweet = p.basic_process(row[15])

                # feature_vector = getFeatureVector(processed_tweet)
                feature_vector, word_list = p.get_fea_vector_and_wordlist(processed_tweet, word_list, stop_words)

                # record feature vector for each tweet
                results.append(feature_vector)

                # record category for each vector
                cataM.append(self.get_category(row[5]))

            word_list = sorted(word_list)

            for i in range(len(results)):
                # combine feature vector and category together
                dataM.append(self.data_matrix(results[i], word_list) + cataM[i])

        f.close()

        # write feature matrix to the file
        with open('featureMatrix.csv', 'wb') as fp:
            writer = csv.writer(fp)
            # write world list into '.csv' file
            writer.writerow(word_list)

            # write feature number matrix into '.csv' file
            for row in dataM:
                writer.writerow(row)

        fp.close()
Example #28
    def get_txt(self):
        pp = Preprocess()
        # open doc original
        filename = self.files[0].replace('\n', '')
        # print filename
        with open(filename, 'r') as txt:
            txt = txt.read().replace('\n', ' ').replace('\r', ' ')
            txt = pp.prep_text(txt)

        return ((filename, txt))
Example #29
def get_data(test_prob=0.2, on_rnn_set = False, use_twitter=True):
    # FIXME: Just uncomment this stuff out
    if on_rnn_set:
        conn = sqlite3.connect("../data/rnn_data.db")
        data = pd.read_sql("SELECT * FROM RNNData", conn).to_numpy()
        data_without_date = data[:,1:].astype(np.float32)
        total_points = np.shape(data_without_date)[0]
        train_data = data_without_date[:int((1-test_prob)*total_points)]
        test_data = data_without_date[int((1-test_prob)*total_points):]
        train_prices = train_data[1:,3]
        test_prices = test_data[1:,3]
        train_data = train_data[:-1]
        test_data = test_data[:-1]
        return train_data, test_data, train_prices, test_prices
    else:
        # Preprocess the actual data
        # Parsing actual data that we will use
        STOCK_DATABASE_PATH = "../data/stock_data.db"
        RNN_DATABASE_PATH = "../data/rnn_data.db"
        x = Preprocess(STOCK_DATABASE_PATH, RNN_DATABASE_PATH)
        numpy_data, df_data, numpy_vanilla_rnn_data, df_vanilla_rnn_data = x.get_data()
    
        X = df_data[["Open", "High", "Low", "Close", "Adj Close", "Volume", "Twitter Score"]]
    
        # Shift to get the previous data as next time step X
        X[["Open", "High", "Low", "Close", "Adj Close", "Volume"]] = X[["Open", "High", "Low", "Close", "Adj Close", "Volume"]].shift(-1)
        y =  df_data["Close"]
    
        # Remove last step that now has a NaN in shifted values
        X = X[:-1]
        y = y[:-1]
    
        # Manually make test_size last 20 percent
        dataset_size = len(X)
        train_prob = 1 - test_prob
        split_point = int(np.round(dataset_size * train_prob))
    
        train_data = X.iloc[:split_point,:]
        test_data = X.iloc[split_point:, :]
        train_prices = y[:split_point]
        test_prices = y[split_point:]
    
        # If want to use without the twitter data for BASELINE MODEL
        if not use_twitter:
            train_data = train_data[["Open", "High", "Low", "Close", "Adj Close", "Volume"]]
            test_data = test_data[["Open", "High", "Low", "Close", "Adj Close", "Volume"]]
    
        # Convert out of dataframes for use in numpy
        train_data = train_data.to_numpy().astype(np.float32)
        test_data = test_data.to_numpy().astype(np.float32)
        train_prices = train_prices.to_numpy().astype(np.float32)
        test_prices = test_prices.to_numpy().astype(np.float32)
    
    
        return train_data, test_data, train_prices, test_prices
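
# Note on the .shift(-1) used above: it moves every column up by one row, so
# row t receives the values from row t+1 and the final row becomes NaN (which
# is why the last step is dropped). Tiny standalone demo:
import pandas as pd

demo = pd.DataFrame({"Close": [10.0, 11.0, 12.0]})
print(demo["Close"].shift(-1).tolist())   # [11.0, 12.0, nan]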
 def test_preprocess(self):
     preprocessor = Preprocess()
     preprocessor.preprocess(self.data)
     for point in self.data:
         self.assertTrue((point['controversiality'] == 0) or (point['controversiality'] == 1))
         self.assertEqual(len(point), 5)
         for word in point['text']:
             try:
                 self.assertTrue(not word.isalpha() or word.islower())
             except AssertionError:
                 print(word)
Example #31
    def test__check_nulls_in_index_column(self):

        df_nulls = self.spark.read.csv(
            'tests/fixtures/preprocess/nulls_recipe_id.csv', header=True)
        df_no_nulls = self.spark.read.csv(
            'tests/fixtures/preprocess/no_nulls_recipe_id.csv', header=True)

        Preprocess(df_labels=df_no_nulls, columns=[''])

        with self.assertRaises(AssertionError):
            Preprocess(df_labels=df_nulls, columns=[''])
Example #32
    def test__check_is_spark_data_frame(self):

        df_simple_table = self.spark.read.csv(
            'tests/fixtures/preprocess/simple_table.csv', header=True)
        pd_df_simple_table = pd.read_csv(
            'tests/fixtures/preprocess/simple_table.csv')

        Preprocess(df_labels=df_simple_table, columns=[''])

        with self.assertRaises(AssertionError):
            Preprocess(df_labels=pd_df_simple_table, columns=[''])
Example #33
 def search(self, query):
     augf = AnalyticUUIDGeneratorFactory()
     aug = augf.create()
     results = []
     for query1 in return_search_results(query.rawQuery):
         query1 = SearchQuery(type=SearchType.SENTENCES,
                              terms=query1.split(" "),
                              rawQuery=query1,
                              k=500)
         result = self.other.search(query1)
         # logging.info(result.searchResultItems)
         results.extend(result.searchResultItems)
     # results = SearchResult(searchResultItems=results, searchQuery=query)
     # logging.info(len(results))
     resultsDict = {}
     for result in results:
         resultsDict[result.sentenceId.uuidString] = result
     results = []
     for key in resultsDict:
         results.append(resultsDict[key])
     # results = results[:10] # comment out on full run
     comm_ids_list, temp = get_comm_ids(results)
     dictUUID = fetch_dataset(comm_ids_list, temp)
     inv_map = {v: k for k, v in dictUUID.items()}
     toHannah = []
     for uuid in dictUUID:
         toHannah.append([query.rawQuery, dictUUID[uuid]])
     resultItemRet = SearchResult(uuid=aug.next(),
                                  searchQuery=query,
                                  searchResultItems=results,
                                  metadata=AnnotationMetadata(
                                      tool="search",
                                      timestamp=int(time.time())),
                                  lang="eng")
     model = pickle.load(open("./trained_model.p", "rb"))
     pre = Preprocess()
     feature_matrix = pre.process_run(toHannah)
     dictRanks = pre_ranking(feature_matrix, model, toHannah, inv_map)
     results = rerank(dictRanks, resultItemRet)
     resultArr = results.searchResultItems
     resultArr = sorted(resultArr,
                        key=lambda result: result.score,
                        reverse=True)
     for item in resultArr:
         logging.info(item.score)
     resultItemRet = SearchResult(uuid=aug.next(),
                                  searchQuery=query,
                                  searchResultItems=resultArr,
                                  metadata=AnnotationMetadata(
                                      tool="search",
                                      timestamp=int(time.time())),
                                  lang="eng")
     return resultItemRet
class TopicModel(object):
    def dataPreprocess(self,path):
        self.preprocess=Preprocess()
        self.preprocess.reader(path)

    def train(self):
        self.lda=LdaModel(self.preprocess.corpus,id2word=self.preprocess.dictionary,num_topics=10)
        for topic in self.lda.print_topics(num_topics=10,num_words=10):
            print(topic[1])

    def evaluation(self):
        pass
Example #35
    def saveModels():
        '''
        This function opens the tarfile, preprocess the data, train models on it and it
        saves the model in the Models directory.
        '''
        tar = tarfile.open('Data/babi_tasks_1-20_v1-2.tar.gz')

        challenges = {
          # QA1 with 10,000 samples
          'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
          # QA2 with 10,000 samples
          'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
        }

        ## Single Supporting Fact Challenge
        ss_train_stories, ss_test_stories, \
            ss_stories_train, ss_questions_train, ss_answers_train, \
            ss_stories_test, ss_questions_test, ss_answers_test, \
            ss_story_maxlen, ss_story_maxsents, ss_question_maxlen, \
            ss_vocab, ss_vocab_size, ss_word2idx = \
            Preprocess.getData(challenges['single_supporting_fact_10k'], tar)

        ss_idx2word = {value : key for key, value in ss_word2idx.items()}

        single_model, single_debug_model = \
            Models.singleModel(ss_story_maxlen, ss_story_maxsents, ss_question_maxlen, ss_vocab_size, \
                            ss_stories_train, ss_questions_train, ss_answers_train, \
                            ss_stories_test, ss_questions_test, ss_answers_test, \
                            EMBEDDING_DIM, NUM_EPOCHS, BATCH_SIZE)

        Utilities.saveModel(single_model, 'single_model')
        Utilities.saveModel(single_debug_model, 'single_debug_model')

        ## Two Supporting Fact challenge
        ts_train_stories, ts_test_stories, \
            ts_stories_train, ts_questions_train, ts_answers_train, \
            ts_stories_test, ts_questions_test, ts_answers_test, \
            ts_story_maxlen, ts_story_maxsents, ts_question_maxlen, \
            ts_vocab, ts_vocab_size, ts_word2idx = \
            Preprocess.getData(challenges['two_supporting_facts_10k'], tar)

        ts_idx2word = {value : key for key, value in ts_word2idx.items()}

        double_model, double_debug_model = \
            Models.doubleModel(ts_story_maxlen, ts_story_maxsents, ts_question_maxlen, ts_vocab_size, \
                            ts_stories_train, ts_questions_train, ts_answers_train, \
                            ts_stories_test, ts_questions_test, ts_answers_test, \
                            EMBEDDING_DIM, NUM_EPOCHS_2, BATCH_SIZE)

        Utilities.saveModel(double_model, 'double_model')
        Utilities.saveModel(double_debug_model, 'double_debug_model')
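
# Preprocess.getData receives the open tar handle plus a member-name template;
# presumably it pulls the train/test members out of the archive roughly along
# these lines (sketch only; decoding and parsing of the bAbI format are elided):
import tarfile

tar = tarfile.open('Data/babi_tasks_1-20_v1-2.tar.gz')
template = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
train_lines = tar.extractfile(template.format('train')).read().decode('utf-8').splitlines()
test_lines = tar.extractfile(template.format('test')).read().decode('utf-8').splitlines()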
Example #36
def main():

    preprocess = Preprocess(data_file, nrows)
    taxi_summary, L, A, T, p_pick, p_tran, r, t_drive, t_wait = preprocess.preprocess_data()

    print("\n\nFeature Generation Completed .....")
    print("\n\n ---- Top 10 rows ---- \n\n", taxi_summary.head())

    prediction = Prediction()

    prediction.MDP_Dynamic_Program(L, A, T, p_pick, p_tran, r, t_drive, t_wait)

    print("\n\nStarting Revenue Prediction .....")
    prediction.predict_revenue(taxi_summary)
Example #37
    def __init__(self):
        print("tensorflow version: ", tf.__version__)

        self.dict_file = 'data/word_dict.txt'
        self.data_map = "data/map.pkl"  # pkl是cpickle模块生成的文件,用于长久保存字符串、列表、字典等数据

        self.batch_size = 20  # feed 20 samples per batch
        self.max_epoch = 10000  # train for at most 10000 rounds
        self.show_batch = 1
        self.model_path = 'model/'
        # load the custom word dictionary into jieba
        jieba.load_userdict(self.dict_file)

        self.location = ["杭州", "重庆", "上海", "北京"]
        self.user_info = {"__UserName__": "yw", "__Location__": "重庆"}
        self.robot_info = {"__RobotName__": "xw"}

        # get the inputs and outputs
        if os.path.isfile(self.data_map):
            with open(self.data_map, "rb") as f:
                data_map = cPickle.load(
                    f)  # read map.pkl back with cPickle; whatever type was written is what is returned
        else:
            p = Preprocess()
            p.main()  # if data_map does not exist, run Preprocess() to rebuild the vectors and the map
            data_map = p.data_map  # data_map is a global dict and can be accessed here

        # look up the key/value pairs in data_map and assign them (it contains nested dicts)
        self.encoder_vocab = data_map.get("Q_vocab")
        self.encoder_vec = data_map.get("Q_vec")
        self.encoder_vocab_size = data_map.get("Q_vocab_size")
        self.char_to_vec = self.encoder_vocab

        self.decoder_vocab = data_map.get("A_vocab")
        self.decoder_vec = data_map.get("A_vec")
        self.decoder_vocab_size = data_map.get("A_vocab_size")
        self.vec_to_char = {v: k for k, v in self.decoder_vocab.items()}

        print("encoder_vocab_size {}".format(self.encoder_vocab_size))
        print("decoder_vocab_size {}".format(self.decoder_vocab_size))
        # initialize the model via DynamicSeq2seq(), passing the encoder/decoder vocabulary sizes
        self.model = DynamicSeq2seq(
            encoder_vocab_size=self.encoder_vocab_size + 1,
            decoder_vocab_size=self.decoder_vocab_size + 1,
        )
        # reserve a fraction of GPU memory for this process up front
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.restore_model()
Example #38
    def norm5(cls, data):
        data = [[d] for d in data]
        print "deviation ", Preprocess.standard_deviation(data)
        # data = Preprocess.scale(data)
        # print 'deviation ', Preprocess.standard_deviation(data)
        data = [(d[0] * 10) + 0.5 for d in data]

        return data
Example #39
def main(args):
  """Main program to run preprocessing of the font Arguments:

    font-file
    --hinting=(False|True)  ,default is false
  """
  options = Options()
  args = options.parse_opts(args, ignore_unknown=True)
  if len(args) < 1:
    print('usage: ./pyprepfnt font-file [--option=value]...', file=sys.stderr)
    sys.exit(1)

  fontfile = args[0]
  args = args[1:]

  filename, extension = os.path.splitext(fontfile)

  cleanfile = filename + '_clean' + extension
  cleanup.cleanup(fontfile, False, cleanfile)

  closure.dump_closure_map(cleanfile, '.')

  preprocess = Preprocess(cleanfile, '.')
  preprocess.base_font()
  preprocess.cmap_dump()
  preprocess.serial_glyphs()
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    preprocesser = Preprocess()
    data= preprocesser.read()

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'Auto Encoder')
    ae = AutoEncoder(
    	layers=[
		Layer("Sigmoid",units=100)
               ],
	learning_rate=0.01,
	n_iter=40,
        verbose=True,
    )
    ae.fit(data[:,:-1])

    print '[INFO, time: %s] Transforming Data with %s ...' % (time.strftime('%H:%M:%S'), 'Auto Encoder')
    splitRatio = 0.67
    train, test = splitDataset(data, splitRatio)
    train = np.asarray(train)
    test = np.asarray(test)
    
    trainX = train[:,:-1]
    trainy = train[:,-1]
    
    testX = test[:,:-1]
    testy = test[:,-1]

    transformed_trainX = ae.transform(trainX)
    transformed_testX = ae.transform(testX)

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'SVM - rbf kernel (i.e. gaussian) with default paramenters')
    clf = SVC()
    clf.fit(transformed_trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(transformed_testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'),accuracy_score(testy, prediction))
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    preprocesser = Preprocess()
    data= preprocesser.read()

    splitRatio = 0.67
    train, test = splitDataset(data, splitRatio)
    train = np.asarray(train)
    test = np.asarray(test)

    trainX = train[:,:-1]
    trainy = train[:,-1]

    testX = test[:,:-1]
    testy = test[:,-1]

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'SVM - rbf kernel (i.e. gaussian) with default paramenters')
    clf = SVC()
    clf.fit(trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'),accuracy_score(testy, prediction))
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    preprocesser = Preprocess()
    data= preprocesser.read()

    splitRatio = 0.67
    train, test = splitDataset(data, splitRatio)
    train = np.asarray(train)
    test = np.asarray(test)

    trainX = train[:,:-1]
    trainy = train[:,-1]

    testX = test[:,:-1]
    testy = test[:,-1]

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'Gradient Boosting Classifier with 300 estimators')
    clf = GradientBoostingClassifier(n_estimators=300)
    clf.fit(trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'),accuracy_score(testy, prediction))
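
# splitDataset is shared by the three scripts above but not shown; a plausible
# sketch of such a helper (hypothetical implementation: a random split at the
# given ratio):
import random

def splitDataset(dataset, splitRatio):
    shuffled = list(dataset)
    random.shuffle(shuffled)
    cut = int(len(shuffled) * splitRatio)
    return shuffled[:cut], shuffled[cut:]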
Example #43
 def scoring(self, method='zagibolov'):
     # Supply argument in Corpus to connect to databse. user, password and db.
     corpus = Corpus(password='', db='project_major')
     corpus.getTweets()
     dataset = corpus.dataSet
     preprocess = Preprocess('zagibolov', self.lexicons, self.negatives, self.stopWords)
     scoring = Scoring(method, self.lexicons, self.negatives, self.stopWords, self.seeds)
     j = 0
     for data in dataset:
         preprocess.preprocessScoring(data)
         processed = preprocess.processed_data
         
     for data in processed:
         scoring.count(data['tweet'])
 ##        print self.seeds
     preprocess.seeds = scoring.lexicon_count
     preprocess.processLexicon()
     scoring.lexicons = preprocess.lexicons
 ##        print scoring.lexicon_count
     last_score = {}
     i = 0
     for i in range(0,3):
         total = 0
         j = 0
         negative = 0
         positive = 0
         scoring.resetLexiconCount()
 ##        print self.lexicons
         for data in processed:
             if j == 50:
                 break
             j += 1
             score = scoring.score(data)
             if score != 0:
                 total += 1
                 if score < 0:
                     negative += 1
                 else:
                     positive += 1
         scoring.adjustScoring()
         if last_score == {}:
             last_score = scoring.lexicons
             this_score = last_score
         else:
             this_score = scoring.lexicons
             if this_score == last_score:
                 break
             else:
                 last_score = this_score
         print this_score
         print "Total scored: " + str(total), "Negative: ", negative, "Positive: ", positive
     print this_score
     print "Total scored: " + str(total), "Negative: ", negative, "Positive: ", positive
Example #44
    def norm4(cls, data):
        data = [[d] for d in data]
        print "deviation ", Preprocess.standard_deviation(data)
        data = [d[0] for d in data]

        data = Preprocess.root(data, 2)
        data = Preprocess.squeeze(data)
        data = Preprocess.squeeze(data)

        # data = [[d] for d in data]
        # data = Preprocess.scale(data)
        # data = [d[0] for d in data]

        data = [[d] for d in data]
        print "deviation ", Preprocess.standard_deviation(data)
        # data = preprocessing.normalize(data, norm='l2')
        data = Preprocess.norm(data)
        print "deviation ", Preprocess.standard_deviation(data)
        data = [d[0] - 0.04 for d in data]
        data = [round(d, 1) for d in data]

        return data
	def testPreprocessWithAllCases(self):
		tweet = "Won't be sleeping for an #overnight with @meyaan. This is going to be a looooooooooooooong night :( #thesis"
		prep = Preprocess(tweet)
		result = prep.preprocess()
		expected = [["not sleeping overnight", []], ["going loong night", [":("]]]
		self.assertEqual(result, expected)
__author__ = 'Thurston'

from ego import Kriging
from preprocess import Preprocess
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

pre = Preprocess(pca_model='../eco_full_pca.pkl', all_dat='../all_games.pkl')
# pre = Preprocess()
# pre.get_json('../alluser_control.json')  # uncomment this to create the pkl file needed!!
# pre.train_pca()
X, y = pre.ready_player_one(2)

unit_sig = np.ones(31)

# scale = StandardScaler()
scale = MinMaxScaler((-1., 1.))
X = scale.fit_transform(X)
#
# # get sigma estimate that maximizes the sum of expected improvements

import scipy.optimize as opt
all_sigs = np.zeros((len(pre.full_tab['id'].tolist()), 31))
all_improv = np.zeros_like(pre.full_tab['id'].tolist())
lb = 0.01
ub = 100.
bounds = [(lb, ub)]*31

for n, i in enumerate(pre.full_tab['id'].tolist()):

    a, b = pre.prep_by_id(i)
	def testPreprocessNormalTweet(self):
		tweet = "Thinking trying social media management tool? Test drive Sprout Social free today!"
		prep = Preprocess(tweet)
		result = prep.preprocess()
		expected = [["thinking trying social media management tool", []], ["test drive sprout social free today", []]]
		self.assertEqual(result, expected)
Example #48
    mean = float(sum(dif))/len(targets)
    root = math.sqrt(mean)
    return root

""" Average of a list """
def avg(array):
    return float(sum(array))/len(array)
        

boston = datasets.load_boston()
#boston = datasets.make_regression()
#data = boston[0]
#target = boston[1]
data = boston.data
target = boston.target
matrix = Preprocess.to_matrix(list(data))
matrix = Preprocess.scale(matrix)
matrix = list(matrix)
target = list(target)
layers = [13,7,1]

dnn = DNN(matrix, target, layers, hidden_layer="TanhLayer", final_layer="LinearLayer", compression_epochs=5, smoothing_epochs=0, bias=True)
full = dnn.fit()
print full
#preds = [dnn.predict(d)[0] for d in matrix]
preds = [full.activate(d)[0] for d in matrix]

print "mrse preds {0}".format(mrse(preds, target))
print "rmse preds {0}".format(rmse(preds, target))

#mean = avg(target)
	def testPreprocessTweetWithMidHashtag(self):
		tweet = "New always-on #AndroidWear apps keep info handy for when you are on the go"
		prep = Preprocess(tweet)
		result = prep.preprocess()
		expected = [["new always-on androidwear apps keep info handy when go", []]]
		self.assertEqual(result, expected)
	def testPreprocessTweetWithEndHashtag(self):
		tweet = "THANKS FOR ALL THE QUESTIONS DURING THIS GAB! KEEP VOTING USING #ChoiceSciFiTVActress"
		prep = Preprocess(tweet)
		result = prep.preprocess()
		expected = [["thanks questions during gab", []], ["keep voting using", []]]
		self.assertEqual(result, expected)
import os.path
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import json


from ego import Kriging
from preprocess import Preprocess
from reg import CovarianceEstimate
import numpy as np

# #
# # get data from the game
# # delete the parameters if performing first-time or new player.
# # Parameters are there to speed up after saving a pkl.
pre = Preprocess(pca_model='../eco_full_pca.pkl', all_dat='../all_games.pkl')
# pre = Preprocess()
# pre.get_json('../alluser_control.json')  # uncomment this to create the pkl file needed!!
# pre.train_pca()
X, y = pre.ready_bad_player()

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# scale = StandardScaler()
scale = MinMaxScaler((-1., 1.))
X = scale.fit_transform(X)

########
# X, y = X[:12], y[:12]
########
import os.path
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import json


from ego import Kriging
from preprocess import Preprocess
from reg import CovarianceEstimate
import numpy as np

# #
# # get data from the game
# # delete the parameters if performing first-time or new player.
# # Parameters are there to speed up after saving a pkl.
pre = Preprocess(pca_model='../eco_full_pca.pkl', all_dat='../all_games.pkl')
# pre = Preprocess()
# pre.get_json('../alluser_control.json')  # uncomment this to create the pkl file needed!!
# pre.train_pca()
X, y = pre.ready_player_one(2)

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# scale = StandardScaler()
scale = MinMaxScaler((-1., 1.))
X = scale.fit_transform(X)

########
n_trajectory = 12
X, y = X[:n_trajectory], y[:n_trajectory] # only use the first few plays
########
from image import Image
from preprocess import Preprocess
from classifier import Classifier
from log_loss import log_loss
from postprocess import PostProcess

genders = Image.genders()
d, _ = Image.data()
matrix = Preprocess.to_matrix(d)
print matrix.shape
matrix = Preprocess.remove_constants(matrix)
print matrix.shape
matrix = Preprocess.scale(matrix)
matrix = Preprocess.polynomial(matrix, 2)
matrix = Preprocess.scale(matrix)
print matrix.shape
matrix = matrix.tolist()
half = len(matrix)/2
train, cv = matrix[:half], matrix[half:]
train_genders, cv_genders = genders[:half], genders[half:]
cv_genders = cv_genders[0::4]
preds = Classifier.ensemble_preds(train, train_genders, cv)
print "Score: ", log_loss(preds, cv_genders)
	def testPreprocessWithMentions(self):
		tweet = "Take a trip to Central City with @MiloVentimiglia and The PET Squad"
		prep = Preprocess(tweet)
		result = prep.preprocess()
		expected = [["take trip central city pet squad", []]]
		self.assertEqual(result, expected)
	def testPreprocessWithContractions(self):
		tweet = "this isn't real! :("
		prep = Preprocess(tweet)
		result = prep.preprocess()
		expected = [["not real", []], ["", [":("]]]
		self.assertEqual(result, expected)
Example #56
"""
Example run, which processes an input at 60s rolling intervals,
calculates mean degree for all intervals, and outputs degrees to
a file.

Call in terminal from root like:
    $ python src/main.py /path/to/input.txt /path/to/output.txt

Equivalent to:
    $ ./run.sh
if the input is /tweet_input/tweets.txt and the output is
/tweet_output/output.txt
"""

# def main():


if __name__ == '__main__':
    import sys
    print(sys.argv)

    pre = Preprocess(sys.argv[1])
    pre.extract()

    graph_gen = rolled_graph_gen(pre.df)

    degrees = g_stats(graph_gen, mean_deg, savename=sys.argv[2])
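
# mean_deg and rolled_graph_gen are defined elsewhere; as a rough illustration
# of the statistic being computed, the mean degree of an undirected graph is
# just 2*|E|/|V| (sketch assuming a networkx graph):
import networkx as nx

def mean_deg_sketch(G: nx.Graph) -> float:
    n = G.number_of_nodes()
    return 2.0 * G.number_of_edges() / n if n else 0.0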


import json


from ego import Kriging
from preprocess import Preprocess
from reg import CovarianceEstimate
import numpy as np
from scipy.spatial.distance import pdist, cdist, squareform
from pyDOE import lhs
from scipy.misc import logsumexp
import scikits.bootstrap as boot
# #
# # get data from the game
# # delete the parameters if performing first-time or new player.
# # Parameters are there to speed up after saving a pkl.
pre = Preprocess(pca_model='../eco_full_pca.pkl', all_dat='../all_games.pkl')
# pre = Preprocess()
# pre.get_json('../alluser_control.json')  # uncomment this to create the pkl file needed!!
# pre.train_pca()
X, y = pre.ready_player_one(2) # MAX: first dimension is number of plays, second is solution space dimension

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# scale = StandardScaler()
scale = MinMaxScaler((-1., 1.))
X = scale.fit_transform(X)

total_no_iters = 31
n_trial = 1
dim = 31
bounds = np.array([[-1.,1.]]*31)
from preprocess import Preprocess
from featureExtract import FeatureExtract
from afinnClassifier import Afinn
from emoticons import Emoticons

global prep

iter = 1
print("Welcome!!!\n")
while iter == 1:
	tweet = raw_input("Please enter a tweet to be analyzed: ")
	prep = Preprocess(tweet) #load Preprocess class

	#preprocess input data
	data = prep.preprocess()
	print data

	#generate bigrams from preprocessed text
	for item in data:
		if not item[0]:
			item[0] = None
		else:
			bigrams = FeatureExtract(item[0]).getBigrams
			item[0] = bigrams
	print data
	
	A = Afinn()
	A.classify(data)

Example #59
    dstdir = './'

    infiles = [ srcdir + 'WikiQA-mz-train.txt', srcdir + 'WikiQA-mz-dev.txt', srcdir + 'WikiQA-mz-test.txt']
    corpus, rel_train, rel_valid, rel_test = prepare.run_with_train_valid_test_corpus(infiles[0], infiles[1], infiles[2])
    print('total corpus : %d ...' % (len(corpus)))
    print('total relation-train : %d ...' % (len(rel_train)))
    print('total relation-valid : %d ...' % (len(rel_valid)))
    print('total relation-test: %d ...' % (len(rel_test)))
    prepare.save_corpus(dstdir + 'corpus.txt', corpus)

    prepare.save_relation(dstdir + 'relation_train.txt', rel_train)
    prepare.save_relation(dstdir + 'relation_valid.txt', rel_valid)
    prepare.save_relation(dstdir + 'relation_test.txt', rel_test)
    print('Preparation finished ...')

    preprocessor = Preprocess(word_stem_config={'enable': False}, word_filter_config={'min_freq': 2})
    dids, docs = preprocessor.run(dstdir + 'corpus.txt')
    preprocessor.save_word_dict(dstdir + 'word_dict.txt', True)
    preprocessor.save_words_stats(dstdir + 'word_stats.txt', True)

    fout = open(dstdir + 'corpus_preprocessed.txt', 'w')
    for inum, did in enumerate(dids):
        fout.write('%s %s %s\n' % (did, len(docs[inum]), ' '.join(map(str, docs[inum]))))
    fout.close()
    print('Preprocess finished ...')

    # dssm_corp_input = dstdir + 'corpus_preprocessed.txt'
    # dssm_corp_output = dstdir + 'corpus_preprocessed_dssm.txt'
    word_dict_input = dstdir + 'word_dict.txt'
    triletter_dict_output = dstdir + 'triletter_dict.txt'
    word_triletter_output = dstdir + 'word_triletter_map.txt'
Example #60
from preprocess import Preprocess

p = Preprocess()
p.preprocessDirectory()