def main():
    # setup logging --------------------------
    logging.basicConfig(filename='plsa.log', level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                        datefmt='%a, %d %b %Y %H:%M:%S')
    #console = logging.StreamHandler()
    #console.setLevel(logging.INFO)
    #logging.getLogger('').addHandler(console)

    # some basic configuration ---------------
    fname = './data.txt'
    fsw = './stopwords.txt'
    eps = 20.0
    key_word_size = 10

    # preprocess -----------------------------
    pp = PP(fname, fsw)
    w_d = pp.get_w_d()
    V, D = w_d.shape
    logging.info('V = %d, D = %d' % (V, D))

    # train model and get result -------------
    pmodel = PLSA()
    for z in range(3, (D + 1), 10):
        t1 = time.clock()
        (l, p_d_z, p_w_z, p_z) = pmodel.train(w_d, z, eps)
        t2 = time.clock()
        # the second value is the likelihood returned by train(), so label it as such
        # (the original format string called it "eps" while passing l)
        logging.info('z = %d, llh = %f, time = %f' % (z, l, t2 - t1))
        for itz in range(z):
            logging.info('Topic %d' % itz)
            data = [(p_w_z[i][itz], i) for i in range(len(p_w_z[:, itz]))]
            data.sort(key=lambda tup: tup[0], reverse=True)
            for i in range(key_word_size):
                logging.info('%s : %.6f ' % (pp.get_word(data[i][1]), data[i][0]))
def testPreprocessWithElongatedWords(self):
    tweet = "dili kaayu klaro imuha :( HAHAHAHA haaaaaaaaaaaaaaays! dapat ipa zoom ang nawong pa more HAHAHAHA :P"
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["dili kaayo klaro imuha", [":("]],
                ["hahahaha haays", []],
                ["dapat ipa zoom ang nawong pa more hahahaha", [":P"]]]
    self.assertEqual(result, expected)
def __init__(self):
    cursor.execute("SELECT content FROM data")
    scripts = cursor.fetchall()
    fw = open('vector.txt', 'w')
    fresult = open('result.txt', 'w')
    mPreprocess = Preprocess()
    mPairedToken = PairToken()
    mConvertVector = ConvertVector()
    stanford = StanfordCoreNLP('http://localhost:9000')
    for script in scripts:
        # if type(script) is tuple:
        listToken = mPreprocess.exec(script[0])
        # else:
        #     listToken = mPreprocess.exec(script)
        listCouple = mPairedToken.exec(listToken)
        output = stanford.annotate(script[0],
                                   properties={'annotators': 'coref', 'outputFormat': 'json'})
        for mCoupleToken in listCouple:
            if self.checkCoreF(output['corefs'], mCoupleToken):
                # fresult.write(str(1) + ' ' + mCoupleToken.np1.text + ' ' + mCoupleToken.np2.text)
                fresult.write(str(1))
                fresult.write('\n')
            else:
                # fresult.write(str(-1) + ' ' + mCoupleToken.np1.text + ' ' + mCoupleToken.np2.text)
                fresult.write(str(-1))
                fresult.write('\n')
            vector = mConvertVector.exec(mCoupleToken)
            fw.write(str(vector))
            fw.write('\n')
def main():
    # setup logging --------------------------
    logging.basicConfig(filename='plsa.log', level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                        datefmt='%a, %d %b %Y %H:%M:%S')
    #console = logging.StreamHandler()
    #console.setLevel(logging.INFO)
    #logging.getLogger('').addHandler(console)

    # some basic configuration ---------------
    fname = './data.txt'
    fsw = './stopwords.txt'
    eps = 20.0
    key_word_size = 10

    # preprocess -----------------------------
    pp = PP(fname, fsw)
    w_d = pp.get_w_d()
    V, D = w_d.shape
    logging.info('V = %d, D = %d' % (V, D))

    # train model and get result -------------
    pmodel = PLSA()
    for z in range(3, (D + 1), 10):
        t1 = time.clock()
        (l, p_d_z, p_w_z, p_z) = pmodel.train(w_d, z, eps)
        t2 = time.clock()
        #logging.info('z = %d, eps = %f, time = %f' % (z, l, t2-t1))
        #print('z = %d, eps = %f, time = %f' % (z, l, t2-t1))
        for itz in range(z):
            logging.info('Topic %d' % itz)
            data = [(p_w_z[i][itz], i) for i in range(len(p_w_z[:, itz]))]
            data.sort(key=lambda tup: tup[0], reverse=True)
            for i in range(key_word_size):
                logging.info('%s : %.6f ' % (pp.get_word(data[i][1]), data[i][0]))
                print('%s : %.6f ' % (pp.get_word(data[i][1]), data[i][0]))
def main():
    '''Training of the model on the preprocessed data.'''
    preprocess = Preprocess()
    data = preprocess.getData(
        path="creditcard.csv",       # path of the csv file
        feature_incides=[0, 29],     # column indices of the features
        label_indices=[30],          # column indices of the labels
        training_size=0.5,           # size of the training set
        standardize=True,            # apply standardization?
        eval_set=True                # create an evaluation set?
    )
    model = Model(
        batch_size=10,               # size of the training batch
        epochs=50,                   # number of training epochs
        nodes=[29, 200, 2],          # list of neurons: the first entry is the number of input
                                     # neurons, the last entry the number of output neurons,
                                     # and the values in between are the hidden neurons
        learning_rate=0.0001,        # learning rate for the training
        hidden_activation="sigmoid", # activation for the hidden nodes: "tanh", "sigmoid" or "relu"
        output_activation="linear",  # activation for the output nodes: "tanh", "sigmoid" or "linear"
        data=data,                   # the loaded and preprocessed data from the csv file
        do_eval=True                 # measure accuracy on the evaluation set?
    )
    model.train()
def __init__(self, data_path):
    self.path = data_path
    self.preprocess = Preprocess()
    self.gender = []
    self.userid = []
    self.model = Model()
    self.model.load_model()
def LoadData():
    print("Preprocess the dataset...", end=' ')
    preprocess = Preprocess()
    SRC, TRG, tr, valid, ts = preprocess.Build()
    print("DONE")
    return SRC, TRG, tr, valid, ts
def closed_form_extra_features(self):
    preprocess1 = Preprocess()
    x_set = preprocess1.matrixify(self.data, 60)
    y_set = Preprocess.get_y(self.data)

    lengths = []
    length_squared = []
    for datapoint in self.data:
        text_length = len(datapoint['text'])
        lengths.append(text_length)

    children_length_inter = []
    children_list = []
    log_children_list = []
    for datapoint in self.data:
        children_list.append(datapoint['children'])
        if datapoint['children'] != 0:
            log_children_list.append(math.log(datapoint['children']))
        else:
            log_children_list.append(0)

    for length, children in zip(lengths, children_list):
        children_length_inter.append(length * children)

    # preprocess1.add_features(children_length_inter)
    # x_set = preprocess1.add_features(log_children_list)
    # x_set = feature_selector.backwardElimination(x_set, y_set, 0.5)
    return self.run_model(x_set, y_set)
def __init__(self):
    self.vect = TfidfVectorizer()
    self.data = None
    self.vect_data = None
    self.pre = Preprocess()
def display_training_and_validation_error(self):
    num_words = 160
    word_nums = np.arange(num_words)
    val_error_list = []
    train_error_list = []
    preprocess1 = Preprocess()
    x_set = preprocess1.matrixify(self.data, num_words)
    y_set = Preprocess.get_y(self.data)
    for x in word_nums:
        cur = x_set[:, 3:3 + x]
        print("Running on top " + str(x) + " words")
        val_error, train_error = self.run_model(cur, y_set)
        val_error_list.append(val_error)
        train_error_list.append(train_error)
    fig, ax = plt.subplots()
    plt.scatter(word_nums, val_error_list, color='blue', s=5, label="Validation set")
    plt.scatter(word_nums, train_error_list, color='red', s=5, label="Training set")
    plt.title("MSE vs number of words used")
    ax.set_xlabel("Words Used")
    ax.set_ylabel("MSE")
    plt.legend(loc='upper right')
    plt.show()
def getConstraints(self, setnumber=""):
    try:
        preprocess = Preprocess()
        absolute_path = path.join(self._path, self._data['params'])
        count = []
        constraints = []
        for filename in listdir(absolute_path):
            match = re.match(self._patterns['params'], filename)
            if match:
                if match.group(2) == setnumber:
                    count.append(match.group(3))
                    with open(path.join(absolute_path, filename), "r") as c:
                        constraints.append(
                            preprocess.preprocessConstraints(c.read().split("\n")))
        if constraints == []:
            return {"error": True, "message": "Something's up"}
        return {"error": False, "constraints": constraints, "count": count}
    except FileNotFoundError:
        return {
            "error": True,
            "message": """Files not found. Please make sure that there is a
            directory called 'params' in the given path, with the files named
            as params.txt or params1.txt or params1-1.txt"""
        }
def elabora(self, path):
    # print path
    img = cv2.imread(path, 0)
    # transformation
    pp = Preprocess()
    img = pp.applyTransform(img, 300, 300)
    descriptorValues = []
    locations = []
    # hd = cv2.HOGDescriptor((32,64), (16,16), (8,8), (8,8), 9)
    # hd = cv2.HOGDescriptor()
    hd = cv2.HOGDescriptor((16, 16), (16, 16), (8, 8), (8, 8), 9)
    # hd = cv2.HOGDescriptor()
    # print "hd length: " + str(len(hd))
    res = hd.compute(img)
    # ls = res.tolist()
    # print str(len(ls))
    """
    des = res[0]
    for i in xrange(1, len(res)):
        des = np.concatenate([des, res[i]])
    """
    return res.ravel()
    # everything below is unreachable after the return
    # print ls
    # print str(len(res))
    # for i in range(0, len(ls)):
    #     des = des + ls[i]
    print "#####################"
def loadData():
    '''
    This function loads the data from various data files and does the basic
    preprocessing. Created to leverage the power of the Streamlit cache.
    '''
    movies_df = Preprocess.loadFile("movies")
    ratings_df = Preprocess.loadFile("ratings")
    final_vector_df = Util.loadObj('final_vector_df')
    embeddings_matrix = final_vector_df.loc[:, final_vector_df.columns != 'movieId']
    embedding_movie_list = final_vector_df['movieId'].tolist()
    ratings_df2 = Preprocess.loadFile("ratings")
    # ratings_input = [ratings_df['userId'].to_numpy(), ratings_df['movieId'].to_numpy(), ratings_df['rating'].to_numpy()]
    users = list(set(ratings_df['userId'].tolist()))
    movies = list(set(ratings_df['movieId'].tolist()))
    users_dict = {u: i for i, u in enumerate(users)}
    movies_dict = {m: i for i, m in enumerate(movies)}      # movie id to idx
    movies_idx_dict = {i: m for i, m in enumerate(movies)}  # idx to movie id
    ratings_df2['userId'] = ratings_df2['userId'].apply(lambda x: users_dict[x])
    ratings_df2['movieId'] = ratings_df2['movieId'].apply(lambda x: movies_dict[x])
    return (movies_df, ratings_df, final_vector_df, embeddings_matrix,
            embedding_movie_list, ratings_df2, users, movies, users_dict,
            movies_dict, movies_idx_dict)
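# The docstring above mentions the Streamlit cache, but no decorator appears in the
# snippet. A minimal sketch of how such a loader is usually cached, assuming the
# legacy st.cache API; the decoration itself is an assumption, not part of the original.
import streamlit as st

@st.cache(allow_output_mutation=True)  # cache the expensive file loads across app reruns
def loadData():
    ...  # body as in the snippet above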
def train():
    from gensim.models import word2vec
    from preprocess import Preprocess, API_download
    import glob

    # pick the texts used for training from train_data
    docs = []
    pathlist = glob.glob("../data/train_data/*")
    for path in pathlist:
        f = open(path)
        text = f.read()
        f.close()
        docs.append(text)

    # prepare the texts
    tagger = API_download.mecab_download()
    word_lists = []
    for doc in docs:
        text = Preprocess.cleaning_text(doc)
        word_class = Preprocess.mecab_list(text, tagger)
        word_list = []
        for word in word_class:
            word_list.append(word[0])
        word_lists.append(word_list)

    # train and build the model
    model = word2vec.Word2Vec(word_lists, size=200, min_count=1, window=5, iter=100)
    return model
def debug(folders, n_components, r=None, max_dimension=1):
    X, y = load_dataset(folders)
    p = Preprocess(n_components)
    X = p.fit_transform(X)
    if r is None:
        distances = PairwiseDistances(X.tolist())
        distances = ExplicitDistances(distances)
        n_samples = len(X)
        r_candidates = sorted(set(np.array(distances.distances).flatten()))
        for r2 in r_candidates:
            print r2
            cx = vietoris_rips(X.tolist(), max_dimension, r2)
            cords = mds_plot(X, y)
            lines_plot(cx, cords)
            plt.show()
    else:
        cx = vietoris_rips(X.tolist(), max_dimension, r)
        actual_max_dimension = len(max(cx, key=len)) - 1
        for d in range(actual_max_dimension, 2, -1):
            sx_d = filter_simplices(cx, d)
            print "dimension", d, ":", len(sx_d), "simplices"
            for i, sx in enumerate(sx_d):
                print i, "..."
                cords = mds_plot(X, y)
                edges = list(combinations(sx, 2))
                lines_plot(edges, cords, color=np.random.rand(3,))
                plt.show()
class Tokenizer():
    def __init__(self, word_level=False, preprocess=True, lang='zh'):
        self.tp = Preprocess(lang=lang)
        self.word_level = word_level
        self.preprocess = preprocess
        self.lang = lang

    def tokenize_str(self, x):
        if self.preprocess:
            if self.word_level:
                x = self.tp.preprocess([x])[0]
            else:
                x = self.tp.clean([x], drop_space=True)[0]
        if self.word_level:
            tokens = x.split(' ')
        else:
            tokens = [t for t in x]
        return tokens

    def __call__(self, X):
        if type(X) is str:
            return self.tokenize_str(X)
        else:
            tokens_list = []
            for x in X:
                tokens_list.append(self.tokenize_str(x))
            return tokens_list
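# Usage sketch for the Tokenizer above. Assumption: the repo's Preprocess class can be
# constructed with lang='zh' as in __init__; passing preprocess=False means its cleaning
# methods are never actually called here.
tok = Tokenizer(word_level=False, preprocess=False)
print(tok("深度学习"))    # character-level tokens: ['深', '度', '学', '习']
print(tok(["ab", "cd"]))  # a list in, a list of token lists out: [['a', 'b'], ['c', 'd']]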
def database():
    from preprocess import Preprocess, API_download
    import glob

    # docs holds the collection of texts, id2doc holds the text names
    docs = []
    id2doc = []
    pathlist = glob.glob("../data/comparison_data/*")
    for path in pathlist:
        f = open(path)
        text = f.read()
        f.close()
        docs.append(text)
        id2doc.append(path)
    print(id2doc)

    # docs is a list of documents
    tagger = API_download.mecab_download()
    word_lists = []
    for doc in docs:
        text = Preprocess.cleaning_text(doc)
        word_class = Preprocess.mecab_list(text, tagger)
        noun_list = Preprocess.noun_extract(word_class)
        noun_list2 = Preprocess.noun_squeeze(noun_list)
        word_lists.append(noun_list2)
    return word_lists, id2doc
def prepare_data(dataset, pca_n):
    global n_classes, X, y, pp, X_tr, X_inv
    n_classes = len(dataset)
    X, y = load_dataset(dataset)
    pp = Preprocess(pca_n)
    X_tr = pp.fit_transform(X)
    X_inv = pp.inverse_transform(X_tr)
def preprocessing(self, method='zagibolov'):
    preprocess = Preprocess(method, self.lexicons, self.negatives, self.stopWords)
    for data in self.corpus:
        preprocess.preprocess(data)
    lexicons = preprocess.lexicons
    self.lexicons = dict(self.lexicons.items() + lexicons.items())
    self.seeds = preprocess.seeds
def main(model_num=1):
    preprocess = Preprocess()
    texts_train, labels_train = preprocess.preprocessData('../projet2/train.txt', mode="train")
    texts_dev, labels_dev = preprocess.preprocessData('../projet2/dev.txt', mode="train")

    MAX_SEQUENCE_LENGTH = 24
    LSTM_DIM = 64
    HIDDEN_LAYER_DIM = 30
    NUM_CLASSES = 4
    GAUSSIAN_NOISE = 0.1
    DROPOUT = 0.2
    DROPOUT_LSTM = 0.2
    BATCH_SIZE = 200

    X_train, X_val, y_train, y_val = train_test_split(texts_train, labels_train,
                                                      test_size=0.2, random_state=42)
    labels_categorical_train = to_categorical(np.asarray(y_train))
    labels_categorical_val = to_categorical(np.asarray(y_val))
    labels_categorical_dev = to_categorical(np.asarray(labels_dev))

    embedding = Embedding('../projet2/emosense.300d.txt')
    embeddings = embedding.getMatrix()
    tokenizer = embedding.getTokenizer()

    message_first_message_train, message_second_message_train, message_third_message_train = get_sequences(
        X_train, MAX_SEQUENCE_LENGTH, tokenizer)
    message_first_message_val, message_second_message_val, message_third_message_val = get_sequences(
        X_val, MAX_SEQUENCE_LENGTH, tokenizer)
    message_first_message_dev, message_second_message_dev, message_third_message_dev = get_sequences(
        texts_dev, MAX_SEQUENCE_LENGTH, tokenizer)

    model = CustomModel(model_num)
    model.build(embeddings, MAX_SEQUENCE_LENGTH, LSTM_DIM, HIDDEN_LAYER_DIM, NUM_CLASSES,
                noise=GAUSSIAN_NOISE, dropout_lstm=DROPOUT_LSTM, dropout=DROPOUT)
    model.summary()
    history = model.train(message_first_message_train, message_second_message_train,
                          message_third_message_train, labels_categorical_train,
                          message_first_message_val, message_second_message_val,
                          message_third_message_val, labels_categorical_val)
    y_pred = model.predict([message_first_message_dev, message_second_message_dev,
                            message_third_message_dev])
def main():
    preprocess = Preprocess()
    preprocess.check_data_distribution()
    print "\n\n*********** ANALYSIS PART I *******************"
    partI_classifier = Classifiers(1)
    partI_classifier.draw_auc_curve(1)
def dataPreprocess(self, path):
    self.preprocess = Preprocess()
    self.preprocess.reader(path)
    # split the dataset
    self.train_data, self.test_data, self.train_labels, self.test_labels = train_test_split(
        self.preprocess.sentences, self.preprocess.labels, test_size=0.3)
    print(self.train_labels[:100])
    self.xgb_train = xgb.DMatrix(np.array(self.train_data), label=np.array(self.train_labels))
    self.xgb_test = xgb.DMatrix(np.array(self.test_data))
def test_remove_non_alpha(self):
    preprocessor = Preprocess()
    preprocessor.preprocess_remove_non_alpha(self.data)
    for point in self.data:
        for word in point['text']:
            try:
                self.assertTrue(word.isalpha())
            except AssertionError:
                print(word)
def test__check_is_list(self):
    df_long = self.spark.read.csv('tests/fixtures/preprocess/long.csv', header=True)
    Preprocess(df_labels=df_long, columns=['country', 'protein'])
    with self.assertRaises(AssertionError):
        Preprocess(df_labels=df_long, columns='protein')
def __init__(self): """initialize dataset and load model""" self.model = load_model(config.model_path) print("[Log] Pretrained model was loaded.") self.preprocess = Preprocess(database_path=config.database_path) print("[Log] Preprocess object was created.") self.database = self.init_database()
def unify_terms(self, _term_list):
    '''
    Unify terms in the term list.
    @param _term_list: [[term]]
    @return: [[term]]
    '''
    _term_list = Preprocess.word_seg([_term_list, ], self._word_seg_config)[0]  # fit the input type of [[term]]
    _term_list = Preprocess.word_lower([_term_list, ])[0]
    return _term_list
def build_feature_matrix(self, file_name):
    # with open('dataset/test.csv', 'rb') as f:
    with open(file_name, 'rb') as f:
        p = Preprocess()
        # "results" contains the preprocessed tweets
        # "word_list" contains all distinct words in the training data
        results = []
        word_list = []
        # dataM contains every tweet's feature vector
        # cataM contains every tweet's category vector
        dataM = []
        cataM = []
        # read training data
        reader = csv.reader(f)
        first = True
        # read stop words from file
        stop_words = p.get_stop_word_list('stopWords.txt')
        for row in reader:
            if first:
                first = False
                continue
            # the 16th column holds the tweet text
            processed_tweet = p.basic_process(row[15])
            # feature_vector = getFeatureVector(processed_tweet)
            feature_vector, word_list = p.get_fea_vector_and_wordlist(processed_tweet,
                                                                      word_list, stop_words)
            # record the feature vector for each tweet
            results.append(feature_vector)
            # record the category for each tweet
            cataM.append(self.get_category(row[5]))
        word_list = sorted(word_list)
        for i in range(len(results)):
            # combine feature vector and category together
            dataM.append(self.data_matrix(results[i], word_list) + cataM[i])
    f.close()
    # write the feature matrix to the file
    with open('featureMatrix.csv', 'wb') as fp:
        writer = csv.writer(fp)
        # write the word list into the '.csv' file
        writer.writerow(word_list)
        # write the feature number matrix into the '.csv' file
        for row in dataM:
            writer.writerow(row)
    fp.close()
def get_txt(self):
    pp = Preprocess()
    # open the original document
    filename = self.files[0].replace('\n', '')
    # print filename
    with open(filename, 'r') as txt:
        txt = txt.read().replace('\n', ' ').replace('\r', ' ')
        txt = pp.prep_text(txt)
    return (filename, txt)
def get_data(test_prob=0.2, on_rnn_set=False, use_twitter=True):
    # FIXME: Just uncomment this stuff out
    if on_rnn_set:
        conn = sqlite3.connect("../data/rnn_data.db")
        data = pd.read_sql("SELECT * FROM RNNData", conn).to_numpy()
        data_without_date = data[:, 1:].astype(np.float32)
        total_points = np.shape(data_without_date)[0]
        train_data = data_without_date[:int((1 - test_prob) * total_points)]
        test_data = data_without_date[int((1 - test_prob) * total_points):]
        train_prices = train_data[1:, 3]
        test_prices = test_data[1:, 3]
        train_data = train_data[:-1]
        test_data = test_data[:-1]
        return train_data, test_data, train_prices, test_prices
    else:
        # Preprocess the actual data that we will use
        STOCK_DATABASE_PATH = "../data/stock_data.db"
        RNN_DATABASE_PATH = "../data/rnn_data.db"
        x = Preprocess(STOCK_DATABASE_PATH, RNN_DATABASE_PATH)
        numpy_data, df_data, numpy_vanilla_rnn_data, df_vanilla_rnn_data = x.get_data()

        X = df_data[["Open", "High", "Low", "Close", "Adj Close", "Volume", "Twitter Score"]]
        # Shift to get the previous data as the next time step's X
        X[["Open", "High", "Low", "Close", "Adj Close", "Volume"]] = \
            X[["Open", "High", "Low", "Close", "Adj Close", "Volume"]].shift(-1)
        y = df_data["Close"]

        # Remove the last step that now has a NaN in the shifted values
        X = X[:-1]
        y = y[:-1]

        # Manually make the test set the last 20 percent
        dataset_size = len(X)
        train_prob = 1 - test_prob
        split_point = int(np.round(dataset_size * train_prob))
        train_data = X.iloc[:split_point, :]
        test_data = X.iloc[split_point:, :]
        train_prices = y[:split_point]
        test_prices = y[split_point:]

        # If we want to run without the Twitter data (BASELINE MODEL)
        if not use_twitter:
            train_data = train_data[["Open", "High", "Low", "Close", "Adj Close", "Volume"]]
            test_data = test_data[["Open", "High", "Low", "Close", "Adj Close", "Volume"]]

        # Convert out of dataframes for use in numpy
        train_data = train_data.to_numpy().astype(np.float32)
        test_data = test_data.to_numpy().astype(np.float32)
        train_prices = train_prices.to_numpy().astype(np.float32)
        test_prices = test_prices.to_numpy().astype(np.float32)
        return train_data, test_data, train_prices, test_prices
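# Standalone illustration (not part of the original pipeline) of the .shift(-1)
# alignment used above: each row's features end up paired with the next row's target,
# and the trailing NaN row is then dropped by the X = X[:-1] step.
import pandas as pd

demo = pd.DataFrame({"Close": [10.0, 11.0, 12.0]})
print(demo["Close"].shift(-1).tolist())  # [11.0, 12.0, nan]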
def test_preprocess(self):
    preprocessor = Preprocess()
    preprocessor.preprocess(self.data)
    for point in self.data:
        self.assertTrue((point['controversiality'] == 0) or (point['controversiality'] == 1))
        self.assertEqual(len(point), 5)
        for word in point['text']:
            try:
                self.assertTrue(not word.isalpha() or word.islower())
            except AssertionError:
                print(word)
def test__check_nulls_in_index_column(self):
    df_nulls = self.spark.read.csv(
        'tests/fixtures/preprocess/nulls_recipe_id.csv', header=True)
    df_no_nulls = self.spark.read.csv(
        'tests/fixtures/preprocess/no_nulls_recipe_id.csv', header=True)
    Preprocess(df_labels=df_no_nulls, columns=[''])
    with self.assertRaises(AssertionError):
        Preprocess(df_labels=df_nulls, columns=[''])
def test__check_is_spark_data_frame(self):
    df_simple_table = self.spark.read.csv(
        'tests/fixtures/preprocess/simple_table.csv', header=True)
    pd_df_simple_table = pd.read_csv(
        'tests/fixtures/preprocess/simple_table.csv')
    Preprocess(df_labels=df_simple_table, columns=[''])
    with self.assertRaises(AssertionError):
        Preprocess(df_labels=pd_df_simple_table, columns=[''])
def search(self, query):
    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    results = []
    for query1 in return_search_results(query.rawQuery):
        query1 = SearchQuery(type=SearchType.SENTENCES,
                             terms=query1.split(" "),
                             rawQuery=query1,
                             k=500)
        result = self.other.search(query1)
        # logging.info(result.searchResultItems)
        results.extend(result.searchResultItems)
    # results = SearchResult(searchResultItems=results, searchQuery=query)
    # logging.info(len(results))
    resultsDict = {}
    for result in results:
        resultsDict[result.sentenceId.uuidString] = result
    results = []
    for key in resultsDict:
        results.append(resultsDict[key])
    # results = results[:10]  # comment out on full run
    comm_ids_list, temp = get_comm_ids(results)
    dictUUID = fetch_dataset(comm_ids_list, temp)
    inv_map = {v: k for k, v in dictUUID.items()}
    toHannah = []
    for uuid in dictUUID:
        toHannah.append([query.rawQuery, dictUUID[uuid]])
    resultItemRet = SearchResult(uuid=aug.next(),
                                 searchQuery=query,
                                 searchResultItems=results,
                                 metadata=AnnotationMetadata(tool="search",
                                                             timestamp=int(time.time())),
                                 lang="eng")
    model = pickle.load(open("./trained_model.p", "rb"))
    pre = Preprocess()
    feature_matrix = pre.process_run(toHannah)
    dictRanks = pre_ranking(feature_matrix, model, toHannah, inv_map)
    results = rerank(dictRanks, resultItemRet)
    resultArr = results.searchResultItems
    resultArr = sorted(resultArr, key=lambda result: result.score, reverse=True)
    for item in resultArr:
        logging.info(item.score)
    resultItemRet = SearchResult(uuid=aug.next(),
                                 searchQuery=query,
                                 searchResultItems=resultArr,
                                 metadata=AnnotationMetadata(tool="search",
                                                             timestamp=int(time.time())),
                                 lang="eng")
    return resultItemRet
class TopicModel(object):

    def dataPreprocess(self, path):
        self.preprocess = Preprocess()
        self.preprocess.reader(path)

    def train(self):
        self.lda = LdaModel(self.preprocess.corpus,
                            id2word=self.preprocess.dictionary,
                            num_topics=10)
        for topic in self.lda.print_topics(num_topics=10, num_words=10):
            print(topic[1])

    def evaluation(self):
        pass
def saveModels():
    '''
    This function opens the tarfile, preprocesses the data, trains models on it
    and saves the models in the Models directory.
    '''
    tar = tarfile.open('Data/babi_tasks_1-20_v1-2.tar.gz')
    challenges = {
        # QA1 with 10,000 samples
        'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
        # QA2 with 10,000 samples
        'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
    }

    ## Single Supporting Fact challenge
    ss_train_stories, ss_test_stories, \
        ss_stories_train, ss_questions_train, ss_answers_train, \
        ss_stories_test, ss_questions_test, ss_answers_test, \
        ss_story_maxlen, ss_story_maxsents, ss_question_maxlen, \
        ss_vocab, ss_vocab_size, ss_word2idx = \
        Preprocess.getData(challenges['single_supporting_fact_10k'], tar)
    ss_idx2word = {value: key for key, value in ss_word2idx.items()}
    single_model, single_debug_model = \
        Models.singleModel(ss_story_maxlen, ss_story_maxsents, ss_question_maxlen, ss_vocab_size,
                           ss_stories_train, ss_questions_train, ss_answers_train,
                           ss_stories_test, ss_questions_test, ss_answers_test,
                           EMBEDDING_DIM, NUM_EPOCHS, BATCH_SIZE)
    Utilities.saveModel(single_model, 'single_model')
    Utilities.saveModel(single_debug_model, 'single_debug_model')

    ## Two Supporting Facts challenge
    ts_train_stories, ts_test_stories, \
        ts_stories_train, ts_questions_train, ts_answers_train, \
        ts_stories_test, ts_questions_test, ts_answers_test, \
        ts_story_maxlen, ts_story_maxsents, ts_question_maxlen, \
        ts_vocab, ts_vocab_size, ts_word2idx = \
        Preprocess.getData(challenges['two_supporting_facts_10k'], tar)
    ts_idx2word = {value: key for key, value in ts_word2idx.items()}
    double_model, double_debug_model = \
        Models.doubleModel(ts_story_maxlen, ts_story_maxsents, ts_question_maxlen, ts_vocab_size,
                           ts_stories_train, ts_questions_train, ts_answers_train,
                           ts_stories_test, ts_questions_test, ts_answers_test,
                           EMBEDDING_DIM, NUM_EPOCHS_2, BATCH_SIZE)
    Utilities.saveModel(double_model, 'double_model')
    Utilities.saveModel(double_debug_model, 'double_debug_model')
def main():
    preprocess = Preprocess(data_file, nrows)
    taxi_summary, L, A, T, p_pick, p_tran, r, t_drive, t_wait = preprocess.preprocess_data()
    print("\n\nFeature Generation Completed .....")
    print("\n\n ---- Top 10 rows ---- \n\n", taxi_summary.head())
    prediction = Prediction()
    prediction.MDP_Dynamic_Program(L, A, T, p_pick, p_tran, r, t_drive, t_wait)
    print("\n\nStarting Revenue Prediction .....")
    prediction.predict_revenue(taxi_summary)
def __init__(self): print("tensorflow version: ", tf.__version__) self.dict_file = 'data/word_dict.txt' self.data_map = "data/map.pkl" # pkl是cpickle模块生成的文件,用于长久保存字符串、列表、字典等数据 self.batch_size = 20 # 每次喂进20个 self.max_epoch = 10000 # 最大100000轮 self.show_batch = 1 # self.model_path = 'model/' # jieba导入词典 jieba.load_userdict(self.dict_file) self.location = ["杭州", "重庆", "上海", "北京"] self.user_info = {"__UserName__": "yw", "__Location__": "重庆"} self.robot_info = {"__RobotName__": "xw"} # 获取输入输出 if os.path.isfile(self.data_map): with open(self.data_map, "rb") as f: data_map = cPickle.load( f) # 使用cpickle读取map.pkl文件内容返回,注意写入是什么类型数据,读取是就是什么类型数据 else: p = Preprocess() p.main() # 如果不存在data_map则调用Preprocess()方法重新创建向量和map data_map = p.data_map # data_map是全局变量的dict,在这里可以取到 # 从data_map中查找各个键值对并赋值,其中也存在字典嵌套 self.encoder_vocab = data_map.get("Q_vocab") self.encoder_vec = data_map.get("Q_vec") self.encoder_vocab_size = data_map.get("Q_vocab_size") self.char_to_vec = self.encoder_vocab self.decoder_vocab = data_map.get("A_vocab") self.decoder_vec = data_map.get("A_vec") self.decoder_vocab_size = data_map.get("A_vocab_size") self.vec_to_char = {v: k for k, v in self.decoder_vocab.items()} print("encoder_vocab_size {}".format(self.encoder_vocab_size)) print("decoder_vocab_size {}".format(self.decoder_vocab_size)) # 调用DynamicSeq2seq()方法,将编码解码词典长度导入,初始化模型 self.model = DynamicSeq2seq( encoder_vocab_size=self.encoder_vocab_size + 1, decoder_vocab_size=self.decoder_vocab_size + 1, ) #优先给程序分配显存 gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333) self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) self.restore_model()
def norm5(cls, data):
    data = [[d] for d in data]
    print "deviation ", Preprocess.standard_deviation(data)
    # data = Preprocess.scale(data)
    # print 'deviation ', Preprocess.standard_deviation(data)
    data = [(d[0] * 10) + 0.5 for d in data]
    return data
def main(args):
    """Main program to run preprocessing of the font.

    Arguments:
      font-file
      --hinting=(False|True), default is False
    """
    options = Options()
    args = options.parse_opts(args, ignore_unknown=True)
    if len(args) < 1:
        print('usage: ./pyprepfnt font-file [--option=value]...', file=sys.stderr)
        sys.exit(1)

    fontfile = args[0]
    args = args[1:]
    filename, extension = os.path.splitext(fontfile)
    cleanfile = filename + '_clean' + extension
    cleanup.cleanup(fontfile, False, cleanfile)
    closure.dump_closure_map(cleanfile, '.')

    preprocess = Preprocess(cleanfile, '.')
    preprocess.base_font()
    preprocess.cmap_dump()
    preprocess.serial_glyphs()
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    preprocesser = Preprocess()
    data = preprocesser.read()

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'Auto Encoder')
    ae = AutoEncoder(
        layers=[
            Layer("Sigmoid", units=100)
        ],
        learning_rate=0.01,
        n_iter=40,
        verbose=True,
    )
    ae.fit(data[:, :-1])

    print '[INFO, time: %s] Transforming Data with %s ...' % (time.strftime('%H:%M:%S'), 'Auto Encoder')
    splitRatio = 0.67
    train, test = splitDataset(data, splitRatio)
    train = np.asarray(train)
    test = np.asarray(test)
    trainX = train[:, :-1]
    trainy = train[:, -1]
    testX = test[:, :-1]
    testy = test[:, -1]
    transformed_trainX = ae.transform(trainX)
    transformed_testX = ae.transform(testX)

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'SVM - rbf kernel (i.e. gaussian) with default parameters')
    clf = SVC()
    clf.fit(transformed_trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(transformed_testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction))
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    preprocesser = Preprocess()
    data = preprocesser.read()

    splitRatio = 0.67
    train, test = splitDataset(data, splitRatio)
    train = np.asarray(train)
    test = np.asarray(test)
    trainX = train[:, :-1]
    trainy = train[:, -1]
    testX = test[:, :-1]
    testy = test[:, -1]

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'SVM - rbf kernel (i.e. gaussian) with default parameters')
    clf = SVC()
    clf.fit(trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction))
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    preprocesser = Preprocess()
    data = preprocesser.read()

    splitRatio = 0.67
    train, test = splitDataset(data, splitRatio)
    train = np.asarray(train)
    test = np.asarray(test)
    trainX = train[:, :-1]
    trainy = train[:, -1]
    testX = test[:, :-1]
    testy = test[:, -1]

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'Gradient Boosting Classifier with 300 estimators')
    clf = GradientBoostingClassifier(n_estimators=300)
    clf.fit(trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction))
def scoring(self, method='zagibolov'):
    # Supply arguments in Corpus to connect to the database: user, password and db.
    corpus = Corpus(password='', db='project_major')
    corpus.getTweets()
    dataset = corpus.dataSet
    preprocess = Preprocess('zagibolov', self.lexicons, self.negatives, self.stopWords)
    scoring = Scoring(method, self.lexicons, self.negatives, self.stopWords, self.seeds)
    j = 0
    for data in dataset:
        preprocess.preprocessScoring(data)
    processed = preprocess.processed_data
    for data in processed:
        scoring.count(data['tweet'])
    ## print self.seeds
    preprocess.seeds = scoring.lexicon_count
    preprocess.processLexicon()
    scoring.lexicons = preprocess.lexicons
    ## print scoring.lexicon_count
    last_score = {}
    i = 0
    for i in range(0, 3):
        total = 0
        j = 0
        negative = 0
        positive = 0
        scoring.resetLexiconCount()
        ## print self.lexicons
        for data in processed:
            if j == 50:
                break
            j += 1
            score = scoring.score(data)
            if score != 0:
                total += 1
                if score < 0:
                    negative += 1
                else:
                    positive += 1
        scoring.adjustScoring()
        if last_score == {}:
            last_score = scoring.lexicons
            this_score = last_score
        else:
            this_score = scoring.lexicons
            if this_score == last_score:
                break
            else:
                last_score = this_score
        print this_score
        print "Total scored: " + str(total), "Negative: ", negative, "Positive: ", positive
    print this_score
    print "Total scored: " + str(total), "Negative: ", negative, "Positive: ", positive
def norm4(cls, data):
    data = [[d] for d in data]
    print "deviation ", Preprocess.standard_deviation(data)
    data = [d[0] for d in data]
    data = Preprocess.root(data, 2)
    data = Preprocess.squeeze(data)
    data = Preprocess.squeeze(data)
    # data = [[d] for d in data]
    # data = Preprocess.scale(data)
    # data = [d[0] for d in data]
    data = [[d] for d in data]
    print "deviation ", Preprocess.standard_deviation(data)
    # data = preprocessing.normalize(data, norm='l2')
    data = Preprocess.norm(data)
    print "deviation ", Preprocess.standard_deviation(data)
    data = [d[0] - 0.04 for d in data]
    data = [round(d, 1) for d in data]
    return data
def testPreprocessWithAllCases(self):
    tweet = "Won't be sleeping for an #overnight with @meyaan. This is going to be a looooooooooooooong night :( #thesis"
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["not sleeping overnight", []],
                ["going loong night", [":("]]]
    self.assertEqual(result, expected)
__author__ = 'Thurston'

from ego import Kriging
from preprocess import Preprocess
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

pre = Preprocess(pca_model='../eco_full_pca.pkl', all_dat='../all_games.pkl')
# pre = Preprocess()
# pre.get_json('../alluser_control.json')  # uncomment this to create the pkl file needed!!
# pre.train_pca()

X, y = pre.ready_player_one(2)
unit_sig = np.ones(31)

# scale = StandardScaler()
scale = MinMaxScaler((-1., 1.))
X = scale.fit_transform(X)

# # # get a sigma estimate that maximizes the sum of expected improvements
import scipy.optimize as opt

all_sigs = np.zeros((len(pre.full_tab['id'].tolist()), 31))
all_improv = np.zeros_like(pre.full_tab['id'].tolist())
lb = 0.01
ub = 100.
bounds = [(lb, ub)] * 31

for n, i in enumerate(pre.full_tab['id'].tolist()):
    a, b = pre.prep_by_id(i)
def testPreprocessNormalTweet(self):
    tweet = "Thinking trying social media management tool? Test drive Sprout Social free today!"
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["thinking trying social media management tool", []],
                ["test drive sprout social free today", []]]
    self.assertEqual(result, expected)
def rmse(preds, targets):
    # NOTE: only the tail of this function survived in the original snippet; the
    # signature and the "dif" line are reconstructed here as an assumption
    # (squared differences, consistent with taking a square root of their mean).
    dif = [(p - t) ** 2 for p, t in zip(preds, targets)]
    mean = float(sum(dif)) / len(targets)
    root = math.sqrt(mean)
    return root

""" Average of a list """
def avg(array):
    return float(sum(array)) / len(array)

boston = datasets.load_boston()
# boston = datasets.make_regression()
# data = boston[0]
# target = boston[1]
data = boston.data
target = boston.target
matrix = Preprocess.to_matrix(list(data))
matrix = Preprocess.scale(matrix)
matrix = list(matrix)
target = list(target)

layers = [13, 7, 1]
dnn = DNN(matrix, target, layers, hidden_layer="TanhLayer", final_layer="LinearLayer",
          compression_epochs=5, smoothing_epochs=0, bias=True)
full = dnn.fit()
print full
# preds = [dnn.predict(d)[0] for d in matrix]
preds = [full.activate(d)[0] for d in matrix]
print "mrse preds {0}".format(mrse(preds, target))
print "rmse preds {0}".format(rmse(preds, target))
# mean = avg(target)
def testPreprocessTweetWithMidHashtag(self):
    tweet = "New always-on #AndroidWear apps keep info handy for when you are on the go"
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["new always-on androidwear apps keep info handy when go", []]]
    self.assertEqual(result, expected)
def testPreprocessTweetWithEndHashtag(self):
    tweet = "THANKS FOR ALL THE QUESTIONS DURING THIS GAB! KEEP VOTING USING #ChoiceSciFiTVActress"
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["thanks questions during gab", []],
                ["keep voting using", []]]
    self.assertEqual(result, expected)
import os.path
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import json

from ego import Kriging
from preprocess import Preprocess
from reg import CovarianceEstimate
import numpy as np

# # # # get data from the game
# # delete the parameters if performing first-time or new player.
# # Parameters are there to speed up after saving a pkl.
pre = Preprocess(pca_model='../eco_full_pca.pkl', all_dat='../all_games.pkl')
# pre = Preprocess()
# pre.get_json('../alluser_control.json')  # uncomment this to create the pkl file needed!!
# pre.train_pca()
X, y = pre.ready_bad_player()

from sklearn.preprocessing import StandardScaler, MinMaxScaler
# scale = StandardScaler()
scale = MinMaxScaler((-1., 1.))
X = scale.fit_transform(X)

########
# X, y = X[:12], y[:12]
########
import os.path
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import json

from ego import Kriging
from preprocess import Preprocess
from reg import CovarianceEstimate
import numpy as np

# # # # get data from the game
# # delete the parameters if performing first-time or new player.
# # Parameters are there to speed up after saving a pkl.
pre = Preprocess(pca_model='../eco_full_pca.pkl', all_dat='../all_games.pkl')
# pre = Preprocess()
# pre.get_json('../alluser_control.json')  # uncomment this to create the pkl file needed!!
# pre.train_pca()
X, y = pre.ready_player_one(2)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
# scale = StandardScaler()
scale = MinMaxScaler((-1., 1.))
X = scale.fit_transform(X)

########
n_trajectory = 12
X, y = X[:n_trajectory], y[:n_trajectory]  # only use the first few plays
########
from image import Image
from preprocess import Preprocess
from classifier import Classifier
from log_loss import log_loss
from postprocess import PostProcess

genders = Image.genders()
d, _ = Image.data()
matrix = Preprocess.to_matrix(d)
print matrix.shape
matrix = Preprocess.remove_constants(matrix)
print matrix.shape
matrix = Preprocess.scale(matrix)
matrix = Preprocess.polynomial(matrix, 2)
matrix = Preprocess.scale(matrix)
print matrix.shape
matrix = matrix.tolist()

half = len(matrix) / 2
train, cv = matrix[:half], matrix[half:]
train_genders, cv_genders = genders[:half], genders[half:]
cv_genders = cv_genders[0::4]
preds = Classifier.ensemble_preds(train, train_genders, cv)
print "Score: ", log_loss(preds, cv_genders)
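# For reference, sklearn's PolynomialFeatures performs the kind of degree-2 expansion
# that Preprocess.polynomial(matrix, 2) appears to do above; that equivalence is an
# assumption, since the repo's own implementation is not shown here.
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

demo = np.array([[1.0, 2.0]])
print(PolynomialFeatures(degree=2).fit_transform(demo))
# [[1. 1. 2. 1. 2. 4.]] -> bias, x1, x2, x1^2, x1*x2, x2^2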
def testPreprocessWithMentions(self):
    tweet = "Take a trip to Central City with @MiloVentimiglia and The PET Squad"
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["take trip central city pet squad", []]]
    self.assertEqual(result, expected)
def testPreprocessWithContractions(self):
    tweet = "this isn't real! :("
    prep = Preprocess(tweet)
    result = prep.preprocess()
    expected = [["not real", []],
                ["", [":("]]]
    self.assertEqual(result, expected)
""" Example run, which processes an input at 60s rolling intervals, caclulates mean degree for all intervals, and outputs degrees to a file. Call in terminal from root like: $ python src/main.py /path/to/input.txt /path/to/output.txt Equivalent to: $ ./run.sh if the input is /tweet_input/tweets.txt and the output is /tweet_output/output.txt """ # def main(): if __name__ == '__main__': import sys print(sys.argv) pre = Preprocess(sys.argv[1]) pre.extract() graph_gen = rolled_graph_gen(pre.df) degrees = g_stats(graph_gen, mean_deg, savename=sys.argv[2])
import json

from ego import Kriging
from preprocess import Preprocess
from reg import CovarianceEstimate
import numpy as np
from scipy.spatial.distance import pdist, cdist, squareform
from pyDOE import lhs
from scipy.misc import logsumexp
import scikits.bootstrap as boot

# # # # get data from the game
# # delete the parameters if performing first-time or new player.
# # Parameters are there to speed up after saving a pkl.
pre = Preprocess(pca_model='../eco_full_pca.pkl', all_dat='../all_games.pkl')
# pre = Preprocess()
# pre.get_json('../alluser_control.json')  # uncomment this to create the pkl file needed!!
# pre.train_pca()
X, y = pre.ready_player_one(2)
# MAX: first dimension is the number of plays, second is the solution-space dimension

from sklearn.preprocessing import StandardScaler, MinMaxScaler
# scale = StandardScaler()
scale = MinMaxScaler((-1., 1.))
X = scale.fit_transform(X)

total_no_iters = 31
n_trial = 1
dim = 31
bounds = np.array([[-1., 1.]] * 31)
from preprocess import Preprocess
from featureExtract import FeatureExtract
from afinnClassifier import Afinn
from emoticons import Emoticons

global prep
iter = 1
print("Welcome!!!\n")
while iter == 1:
    tweet = raw_input("Please enter a tweet to be analyzed: ")
    prep = Preprocess(tweet)  # load the Preprocess class

    # preprocess the input data
    data = prep.preprocess()
    print data

    # generate bigrams from the preprocessed text
    for item in data:
        if not item[0]:
            item[0] = None
        else:
            bigrams = FeatureExtract(item[0]).getBigrams
            item[0] = bigrams
    print data

    A = Afinn()
    A.classify(data)
dstdir = './'
infiles = [
    srcdir + 'WikiQA-mz-train.txt',
    srcdir + 'WikiQA-mz-dev.txt',
    srcdir + 'WikiQA-mz-test.txt'
]

corpus, rel_train, rel_valid, rel_test = prepare.run_with_train_valid_test_corpus(
    infiles[0], infiles[1], infiles[2])
print('total corpus : %d ...' % (len(corpus)))
print('total relation-train : %d ...' % (len(rel_train)))
print('total relation-valid : %d ...' % (len(rel_valid)))
print('total relation-test : %d ...' % (len(rel_test)))

prepare.save_corpus(dstdir + 'corpus.txt', corpus)
prepare.save_relation(dstdir + 'relation_train.txt', rel_train)
prepare.save_relation(dstdir + 'relation_valid.txt', rel_valid)
prepare.save_relation(dstdir + 'relation_test.txt', rel_test)
print('Preparation finished ...')

preprocessor = Preprocess(word_stem_config={'enable': False},
                          word_filter_config={'min_freq': 2})
dids, docs = preprocessor.run(dstdir + 'corpus.txt')
preprocessor.save_word_dict(dstdir + 'word_dict.txt', True)
preprocessor.save_words_stats(dstdir + 'word_stats.txt', True)

fout = open(dstdir + 'corpus_preprocessed.txt', 'w')
for inum, did in enumerate(dids):
    fout.write('%s %s %s\n' % (did, len(docs[inum]), ' '.join(map(str, docs[inum]))))
fout.close()
print('Preprocess finished ...')

# dssm_corp_input = dstdir + 'corpus_preprocessed.txt'
# dssm_corp_output = dstdir + 'corpus_preprocessed_dssm.txt'
word_dict_input = dstdir + 'word_dict.txt'
triletter_dict_output = dstdir + 'triletter_dict.txt'
word_triletter_output = dstdir + 'word_triletter_map.txt'
from preprocess import Preprocess

p = Preprocess()
p.preprocessDirectory()