def get_test_data(max_size, n_cats):
    # Load the word -> index vocabulary built from the top 30,000 words.
    with open('word_to_index_top_30000.json', 'r') as f:
        d = json.load(f)
    data = []
    labels = []
    print("collecting test data and labels..")
    with open('test_data.csv', 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile)
        for r in reader:
            words = preprocess_string(r[0], CUSTOM_FILTERS)
            nums = [0] * len(words)
            for i, word in enumerate(words):
                if word in d:
                    nums[i] = d[word]
            data.append(nums)
            labels.append(r[-1])
    print("collected test data and labels successfully.")
    print("preparing test data and labels..")
    # Drop the first (header) row before padding and encoding.
    x_test, y_test = prepare(X=data[1:], y=labels[1:], max_size=max_size,
                             n_cats=n_cats, shuffle_data=True)
    print("prepared test data and labels successfully.")
    return x_test, y_test
def get_train_data(max_size, n_cats):
    data = []
    labels = []
    print('collecting training data..')
    with open('data.csv', 'r') as f:
        for row in csv.reader(f):
            nums = [0] * len(row)
            for i, d in enumerate(row):
                nums[i] = int(d)
            data.append(nums)
    print('collected training data successfully.')
    print('collecting training labels..')
    with open('labels.csv', 'r') as f:
        for row in csv.reader(f):
            labels.append(int(row[0]))
    print('collected training labels successfully.')
    print("preparing training data and labels..")
    x_train, y_train = prepare(X=data, y=labels, max_size=max_size,
                               n_cats=n_cats, shuffle_data=True)
    print("prepared training data and labels successfully.")
    return x_train, y_train
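Both loaders above delegate padding, label encoding, and shuffling to a `prepare` helper that is not shown in this excerpt. A minimal sketch of what such a helper might look like, assuming it pads or truncates each index sequence to `max_size`, one-hot encodes labels into `n_cats` categories, and optionally shuffles; the real implementation may differ:

import numpy as np

def prepare(X, y, max_size, n_cats, shuffle_data=False):
    # Hypothetical helper: pad/truncate each index sequence to max_size.
    x_arr = np.zeros((len(X), max_size), dtype=np.int64)
    for i, seq in enumerate(X):
        seq = seq[:max_size]
        x_arr[i, :len(seq)] = seq
    # One-hot encode the integer (or numeric string) labels into n_cats categories.
    y_idx = np.asarray([int(v) for v in y])
    y_arr = np.zeros((len(y), n_cats), dtype=np.float32)
    y_arr[np.arange(len(y)), y_idx] = 1.0
    if shuffle_data:
        order = np.random.permutation(len(X))
        x_arr, y_arr = x_arr[order], y_arr[order]
    return x_arr, y_arr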
def main(**kwargs):
    kwargs = parse_cl(sys.argv[1:])
    initialize_logger(kwargs['args'].out, kwargs['args'].debug, kwargs['args'].print_debug)
    logger = mylog.getLogger(__name__)
    start = time.time()
    if "prepare" in kwargs:
        logger.info("Run prepare")
        prepare(kwargs["args"])
    elif "cluster" in kwargs:
        logger.info("Run cluster")
        cluster(kwargs["args"])
    elif "report" in kwargs:
        logger.info("Run report")
        report(kwargs["args"])
    elif "predict" in kwargs:
        logger.info("Run predictions")
        predictions(kwargs["args"])
    elif "target" in kwargs:
        logger.info("Run target annotation")
        targets_enrichment(kwargs["args"])
    elif "seqbuster" in kwargs:
        logger.info("Run seqbuster")
        miraligner(kwargs["args"])
    elif "explore" in kwargs:
        logger.info("Run explore")
        explore(kwargs["args"])
    elif "stats" in kwargs:
        logger.info("Run stats")
        stats(kwargs["args"])
    elif "collapse" in kwargs:
        logger.info("Run collapse")
        collapse_fastq(kwargs["args"])
    elif "simulator" in kwargs:
        logger.info("Run simulator")
        simulate(kwargs["args"])
    logger.info('It took %.3f minutes' % ((time.time() - start) / 60))
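The dispatcher above assumes that `parse_cl` returns a dict keyed by the chosen subcommand, with the parsed namespace stored under 'args'. A minimal sketch of such a parser built on argparse subcommands; the options shown here are placeholders, not the project's real flags:

import argparse

def parse_cl(argv):
    # Hypothetical sketch: one subparser per subcommand; options are illustrative only.
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="command")
    for name in ("prepare", "cluster", "report", "predict", "target",
                 "seqbuster", "explore", "stats", "collapse", "simulator"):
        sub = subparsers.add_parser(name)
        sub.add_argument("--out", default="out")
        sub.add_argument("--debug", action="store_true")
        sub.add_argument("--print_debug", action="store_true")
    args = parser.parse_args(argv)
    # Return a dict so the dispatcher can test `"<subcommand>" in kwargs`.
    return {args.command: True, "args": args}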
def train(data_dir, review_dir, embedding_dir, model_dir):
    """
    Tokenize the reviews into single words with the Natural Language Toolkit (nltk),
    then train a gensim Word2Vec model on them to provide embedding vectors.
    """
    embedding_dim = 100
    x_train, x_test, _, _ = prepare(data_dir)
    all_reviews = x_test + x_train

    # Tokenize, lowercase, strip punctuation, and drop non-alphabetic and stop words.
    table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    review_lines = []
    counter = 0
    for line in all_reviews:
        tokens = word_tokenize(line)
        tokens = [w.lower() for w in tokens]
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]
        words = [w for w in words if w not in stop_words]
        review_lines.append(words)
        counter += 1
        if counter % 10000 == 0:
            print(counter, '/', len(all_reviews))

    # Persist the tokenized reviews for later reuse.
    with open(review_dir, 'w') as f:
        wr = csv.writer(f)
        wr.writerows(review_lines)
    print(review_lines[0])
    print(review_lines[3])
    print(len(review_lines))

    # Train the Word2Vec embeddings (gensim 3.x API: `size` is the vector dimension).
    model = gensim.models.Word2Vec(sentences=review_lines, size=embedding_dim,
                                   window=5, workers=4, min_count=10)
    words = list(model.wv.vocab)
    print('Vocabulary size: %d' % len(words))
    model.save(model_dir)
    model.wv.save_word2vec_format(embedding_dir, binary=False)
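A brief usage sketch for the artifacts saved above, assuming the same gensim 3.x API the training code uses; the file paths and the queried word are illustrative only:

import gensim

# Reload the full model (saved with model.save) and query the embeddings.
model = gensim.models.Word2Vec.load('reviews_w2v.model')       # hypothetical path
print(model.wv['good'])                                        # 100-dimensional word vector
print(model.wv.most_similar('good', topn=5))                   # nearest neighbours in embedding space

# The plain-text file written with save_word2vec_format can be reloaded as KeyedVectors.
kv = gensim.models.KeyedVectors.load_word2vec_format('reviews_embedding.txt', binary=False)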
import prepare_data
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

titanic = prepare_data.prepare("train.csv")
titanic_test = prepare_data.prepare("test.csv")

kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

# Each entry pairs an estimator with the feature columns it should use.
algorithms = [
    [RandomForestClassifier(random_state=1, n_estimators=10000, min_samples_split=5, min_samples_leaf=2),
     ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title", "FamilyId"]],
    [LogisticRegression(random_state=1),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]

predictions = []
for train, test in kf:
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    for alg, predictors in algorithms:
        # Fit on the training fold, then collect class-1 probabilities on the held-out fold.
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
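The excerpt stops inside the cross-validation loop. One common way to finish this kind of ensemble, shown here only as a hedged sketch (the equal weighting and the 0.5 threshold are assumptions, not taken from the original), is to blend the two models' fold probabilities, threshold them, and score the stitched-together predictions:

    # Hypothetical continuation of the fold loop: average the two probability
    # vectors for this fold and convert them to hard 0/1 labels at 0.5.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    test_predictions[test_predictions <= 0.5] = 0
    test_predictions[test_predictions > 0.5] = 1
    predictions.append(test_predictions)

# Stitch the fold predictions back together and measure accuracy on the training set.
predictions = np.concatenate(predictions, axis=0)
accuracy = (predictions == titanic["Survived"]).mean()
print(accuracy)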
            if j == inp.num_of_relay_positions:
                print("|", end='|')
        print()
        if i == inp.num_of_relay_positions:
            print("---" * 20)
    print("========")
    print('optimal value = ', solver.Objective().Value())
    print()
    print("Time = ", solver.WallTime(), " milliseconds")
    return dict_constant["l"] * solver.Objective().Value(), connect_matrix_result, solver.WallTime()


if __name__ == '__main__':
    _dict_constant, _data_path = parse_config()
    logger = init_log()
    paths = glob.glob(_data_path)
    print(paths)
    # paths.reverse()
    for path in paths:
        logger.info("input path %s: ", path)
        _inp, _is_adj_matrix, _distance_matrix = prepare(path)
        result, connect_matrix, t = solve_by_or_tools(_inp, _is_adj_matrix,
                                                      _distance_matrix, _dict_constant)
        # logger.info("Connected Matrix: \n%s", connect_matrix)
        logger.info("Result: %s", result)
        logger.info("Time: %s", t)
def train_network(data_dir, review_dir, embedding_dir, models_dir, logs_dir, batch, epochs, transfer):
    """
    Define the neural network structure and run all of the training
    on the provided data.
    """
    embedding_dim = 100
    _, _, y_train, y_test = prepare(data_dir)
    embedding_matrix, tokenizer_obj, num_words = use(embedding_dir)

    # Reload the tokenized reviews written out during embedding training.
    with open(review_dir, 'r') as f:
        review_lines = list(list(rec) for rec in csv.reader(f, delimiter=','))

    # Use the average review length, plus a margin, as the padded input length.
    avg_length = 0
    for i in review_lines:
        avg_length += len(i)
    avg_length = int(avg_length / len(review_lines) + 100)
    print('Data prepared')
    print('')

    model = Sequential()
    if transfer:
        # Start from the pre-trained Word2Vec embeddings and keep them frozen.
        embedding_layer = Embedding(num_words, embedding_dim, input_length=avg_length, trainable=False)
        embedding_layer.build((None,))
        embedding_layer.set_weights([embedding_matrix])
    else:
        embedding_layer = Embedding(num_words, embedding_dim, input_length=avg_length, trainable=True)
    model.add(embedding_layer)
    model.add(Flatten())
    model.add(Dense(256, activation='tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='sigmoid'))

    adam = optimizers.Adam(lr=0.001)
    model.compile(loss="binary_crossentropy", optimizer=adam, metrics=['accuracy'])
    checkpointer = ModelCheckpoint(filepath=models_dir, verbose=1, save_best_only=False,
                                   save_weights_only=False, period=1)
    tensorboard = TensorBoard(log_dir=logs_dir.format(time()))

    # Hold out the first 200000 reviews for validation and shuffle both splits.
    x_train = review_lines[200000:]
    x_test = review_lines[:200000]
    x_shuffle_train = list(zip(x_train, y_train))
    random.shuffle(x_shuffle_train)
    x_train, y_train = zip(*x_shuffle_train)
    x_train = list(x_train)
    y_train = list(y_train)
    x_shuffle_test = list(zip(x_test, y_test))
    random.shuffle(x_shuffle_test)
    x_test, y_test = zip(*x_shuffle_test)
    x_test = list(x_test)
    y_test = list(y_test)
    y_train = to_categorical(y_train, num_classes=2)
    y_test = to_categorical(y_test, num_classes=2)

    training_batch_generator = Generator(batch, x_train, y_train, tokenizer_obj, avg_length)
    validation_batch_generator = Generator(batch, x_test, y_test, tokenizer_obj, avg_length)
    print('Model prepared, start training...')
    model.fit_generator(generator=training_batch_generator, steps_per_epoch=(300000 // batch),
                        epochs=epochs, verbose=1,
                        validation_data=validation_batch_generator,
                        validation_steps=(200000 // batch),
                        use_multiprocessing=False, max_queue_size=1,
                        callbacks=[checkpointer, tensorboard])

    if transfer:
        # Fine-tune: unfreeze the embedding layer and continue with a small learning rate.
        model.layers[0].trainable = True
        sgd = optimizers.SGD(lr=0.00001)
        model.compile(loss="binary_crossentropy", optimizer=sgd, metrics=['accuracy'])
        model.fit_generator(generator=training_batch_generator, steps_per_epoch=(300000 // batch),
                            epochs=2, verbose=1,
                            validation_data=validation_batch_generator,
                            validation_steps=(200000 // batch),
                            use_multiprocessing=False, max_queue_size=1,
                            callbacks=[checkpointer, tensorboard])
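`Generator` is used above but not defined in this excerpt. A minimal sketch of what it might look like, assuming a `keras.utils.Sequence` that maps each tokenized review to padded index sequences with the fitted `tokenizer_obj`; the real class may differ:

import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import Sequence

class Generator(Sequence):
    """Hypothetical batch generator: tokenized reviews -> padded index batches."""

    def __init__(self, batch_size, x_set, y_set, tokenizer_obj, max_length):
        self.batch_size = batch_size
        self.x, self.y = x_set, y_set
        self.tokenizer_obj = tokenizer_obj
        self.max_length = max_length

    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]
        # Map word lists to integer ids and pad every review to the same length.
        seqs = self.tokenizer_obj.texts_to_sequences([' '.join(words) for words in batch_x])
        return pad_sequences(seqs, maxlen=self.max_length), np.asarray(batch_y)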
        for j in range(T):
            this_dset[j] = views[i][j]
        dsets.append(this_dset)
    """
    colors = []
    for article in p[:10]:
        colors.append(colorsys.hsv_to_rgb(0.588, 0.2, random.uniform(0.4, 0.7)))
    # stacked_graph(dsets, baseline_fn=min_weighted_wiggles, color_seq='random')
    sg = pystreamgraph.StreamGraph(views, colors=colors, labels=escaped_p)
    sg.draw("generated_figure.svg", "MH370 related articles", show_labels=True,
            width=1800, height=8400)
    # pl.savefig('generated_figure.png')
    # pl.show()


if __name__ == '__main__':
    nicknames = ['olympics', 'mh370', 'ebola']
    for nickname in nicknames:
        p = pickle.load(open('../page/' + nickname + '_cluster.pickle'))
        views = prepare_data.prepare(p)
        v = np.array(views)
        v2 = np.fliplr(np.rot90(v.copy(), -1))
        # Render the view counts into the streamgraph.js template and write a data file.
        template = open('streamgraph.js/template.tpl')
        t = template.read()
        t = t.replace("<DATA>", str(v2.tolist()))
        t = t.replace("<TITLES>", str(p))
        h = open('streamgraph.js/data/' + nickname + '.js', "w")
        h.write(t)
        h.close()
import sys, getopt, random
import os
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as f
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pyspark.sql.functions import percent_rank
from pyspark.sql import Window
from prepare_data import prepare
from prepare_data import train_val_test_split
from pyspark.mllib.tree import RandomForest

filepath = "hdfs:/user/ct2522"
data = prepare(filepath)
train, val, test = train_val_test_split(data)
train_col = train.columns
train_col.remove("Popularity")

# Note: pyspark.mllib's RandomForest.trainClassifier expects an RDD of LabeledPoint
# plus numClasses, not a DataFrame; a DataFrame-based alternative is sketched below.
rf = RandomForest.trainClassifier(train)
rf.predict(val.drop("Popularity"))
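As noted in the comment above, the MLlib call as written will not run against a DataFrame. A hedged sketch of the DataFrame-friendly route using the already-imported VectorAssembler and `pyspark.ml`'s RandomForestClassifier, assuming "Popularity" is a numeric label column; the tree count and evaluation metric are illustrative choices, not taken from the original:

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Assemble every non-label column into a single feature vector.
assembler = VectorAssembler(inputCols=train_col, outputCol="features")
train_vec = assembler.transform(train)
val_vec = assembler.transform(val)

# Train on the assembled training split and score the validation split.
rf = RandomForestClassifier(labelCol="Popularity", featuresCol="features", numTrees=100)
model = rf.fit(train_vec)
preds = model.transform(val_vec)

evaluator = MulticlassClassificationEvaluator(labelCol="Popularity", predictionCol="prediction",
                                              metricName="accuracy")
print(evaluator.evaluate(preds))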