def main(): seq = 1 data = ReadData(dsName='airsim', subType='mr', seq=seq) barNames = data.getNewImgNames(subtype='bar') pinNames = data.getNewImgNames(subtype='pin') dirBar = data.path + '/images_bar' dirPin = data.path + '/images_pin' if not os.path.exists(dirBar): os.makedirs(dirBar) if not os.path.exists(dirPin): os.makedirs(dirPin) N = data.imgs.shape[0] for i in range(0, N): img = data.imgs[i] img = np.reshape(img, (360, 720, 3)) pin = cv2.fisheye.undistortImage(img, K, D=D_pincus, Knew=K_pincus) bar = cv2.fisheye.undistortImage(img, K, D=D_barrel, Knew=K_barrel) # cv2.imshow('input', img) # cv2.imshow('pin', pin) # cv2.imshow('bar', bar) # cv2.waitKey(1) cv2.imwrite(barNames[i], bar * 255.0) cv2.imwrite(pinNames[i], pin * 255.0) print(i / N)
def main(): filenames = ReadData.listStdDir(CSV_FILEPATH) for filename in filenames: print filename tweets = ReadData.CSVFileToMatrix(filename) for tweet in tweets: processTweet(tweet)
def ReadAndExtractAll(fname='../data/features_all_v2.2.pkl'): ''' read all data, extract features, write to dill ''' short_pid, short_data, short_label = ReadData.ReadData( '../../data1/short.csv') long_pid, long_data, long_label = ReadData.ReadData('../../data1/long.csv') QRS_pid, QRS_data, QRS_label = ReadData.ReadData('../../data1/QRSinfo.csv') center_waves = ReadData.read_mean_wave( '../../data1/center_wave_euclid_direct.csv') all_pid = QRS_pid all_feature = GetAllFeature(short_data, long_data, QRS_data, long_pid, short_pid, center_waves) all_label = QRS_label print('ReadAndExtractAll done') print('all_feature shape: ', np.array(all_feature).shape) # with open(fname, 'wb') as output: # dill.dump(all_pid, output) # dill.dump(all_feature, output) # dill.dump(all_label, output) return
def __init__(self, filename): points = RD.obj_vertices(filename) normals = RD.obj_normals(filename) self.pwns = [points[i] + normals[i] for i in range(len(points))] sorted_pwns = Lc.locate(self.pwns) self.plates = [] for key in sorted_pwns: self.plates.append(Plate.plate(key, sorted_pwns[key])) self.intersect_matrix = np.matrix([[0, 0, 1], [0, 0, 1], [1, 1, 0]]) n = 10 self.inter_segments = [] self.inter_segment_markers = [] for i in range(len(self.plates)): for j in range(i + 1, len(self.plates)): if self.intersect_matrix[i, j] == 1: segment = Bd.intersect_of_plate(self.plates[j], self.plates[i]) Bd.sub_segment(segment, n) self.inter_segments.append(segment) self.plates[i].add_segment(segment) self.plates[j].add_segment(segment) self.inter_segment_markers.append( [[i, len(self.plates[i].segments) - 1], [j, len(self.plates[j].segments) - 1]])
def ReadAndExtractAll(fname='../data/features_all_v2.5.pkl'): ''' read all data, extract features, write to dill ''' short_pid, short_data, short_label = ReadData.ReadData( '../../data1/short.csv') long_pid, long_data, long_label = ReadData.ReadData('../../data1/long.csv') QRS_pid, QRS_data, QRS_label = ReadData.ReadData('../../data1/QRSinfo.csv') center_waves = ReadData.read_mean_wave('../../data1/centerwave_raw.csv') all_pid = QRS_pid feature_list, all_feature = GetAllFeature(short_data, long_data, QRS_data, long_pid, short_pid, center_waves) all_label = QRS_label print('ReadAndExtractAll done') print('all_feature shape: ', np.array(all_feature).shape) print('feature_list shape: ', len(feature_list)) np.nan_to_num(all_feature) with open(fname + '_feature_list.csv', 'w') as fout: for i in feature_list: fout.write(i + '\n') with open(fname, 'wb') as output: dill.dump(all_pid, output) dill.dump(all_feature, output) dill.dump(all_label, output) print('write done') return
def read_data(): long_pid, long_data, long_label = ReadData.ReadData('../../data1/long.csv') # mat1 = [truncate_long(ts, 9000) for ts in long_data] # mat2 = [truncate_long(ts, 6000) for ts in long_data] mat3 = [truncate_long(ts, 3000) for ts in long_data] # mat4 = [sample_long(ts, 10) for ts in mat1] # mat5 = [sample_long(ts, 10) for ts in mat2] # mat6 = [sample_long(ts, 10) for ts in mat3] label_onehot = ReadData.Label2OneHot(long_label) # plt.plot(mat1[0]) # plt.plot(mat4[0]) mat = mat3 all_feature = np.array(mat, dtype=np.float32) all_label = np.array(label_onehot, dtype=np.float32) kf = StratifiedKFold(n_splits=5, shuffle=True) for train_index, test_index in kf.split(all_feature, long_label): train_data = all_feature[train_index] train_label = all_label[train_index] test_data = all_feature[test_index] test_label = all_label[test_index] break train_data = np.expand_dims(np.array(train_data, dtype=np.float32), axis=2) test_data = np.expand_dims(np.array(test_data, dtype=np.float32), axis=2) return train_data, train_label, test_data, test_label
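# Note on the fold loop above: only the first StratifiedKFold split is used (the loop
# breaks immediately). A minimal equivalent sketch, assuming scikit-learn's
# train_test_split is acceptable here and reusing the names from inside read_data(),
# gets the same single stratified 80/20 split more directly:
from sklearn.model_selection import train_test_split

train_data, test_data, train_label, test_label = train_test_split(
    all_feature, all_label, test_size=0.2, stratify=long_label)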
def test_exponential(self): import numpy as np retval = ReadData.exponential(0, 0) self.assertEqual(retval, 1) retval = ReadData.exponential(np.array([0, 0]), 1) self.assertEqual(retval[0], 1) self.assertEqual(retval[1], 1)
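# The tests above only pin down ReadData.exponential at x == 0. A minimal stand-in
# consistent with those assertions, assuming exponential(x, rate) = exp(-rate * x)
# elementwise (this is an assumption, not the actual ReadData implementation):
import numpy as np

def exponential(x, rate):
    return np.exp(-rate * np.asarray(x, dtype=float))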
def init():
    start = time.time()
    setTextMtime()
    ReadData.init()
    getDcit6()
    end = time.time()
    return "Run time: %.03f seconds" % (end - start)
def main(): timer = time.time() # Read both the training and the test data ratings, users, books = r.PandaReader("./data/BX-Book-Ratings-Train.csv", "./data/BX-Users.csv", "./data/BX-Books.csv") testRatings = r.ReadTest("./data/BXBookRatingsTest.csv", users, books) print("Read data time: ", time.time() - timer) timer = time.time() # Construct the dictionaries necessary userRatingMap, bookRatingMap = knn.ConstructTrainModel(ratings) userRatingTestMap, bookRatingTestMap = knn.ConstructTrainModel(testRatings) print("matrix creation time: ", time.time() - timer) timer = time.time() # Choose similarity function = 'Cor' # Validate or test ''' f = open("wknn" + function , "w") f.write("k" + " threshold" + " mae" + " time" + " weight" + "\n") min = [0, 0] for k in range(1, 50, 2): print("K = ", k) for split in range(0, 5): print("split = ", split) timer = time.time() sim, mae = knn.ValidateData(userRatingMap, bookRatingMap, split_1=split * int(len(userRatingMap) / 5), split_2=(split + 1) * int(len(userRatingMap) / 5), k=k, function=function, threshold=8, weighted=True) min[0] += mae min[1] += time.time() - timer print("Test k time: ", time.time() - timer) min[0] /= 5 min[1] /= 5 f.write("%d %d %.2f %.2f %d\n" % (k, 8, min[0], min[1], 1)) min[0] = min[1] = 0 ''' sim, mae = knn.TestData(userRatingMap, userRatingTestMap, bookRatingMap, k=3, function=function, threshold=0, weighted=True) print("Validation time: ", time.time() - timer)
def train(): x = tf.placeholder(tf.float32, [None, 224,224,3]) y = tf.placeholder(tf.float32, [None, 5]) train_data,train_label=ReadData.get_outorder_data_train(train_dir) test_data, test_label = ReadData.get_outorder_data_test(test_dir) #logits = inference(x,True,regularizer) #logits = alexnet(x, 0.5, 5) #logits = vgg_net(x) logits = VGGNet_11(x, 0.5) loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y) loss = tf.reduce_mean(loss, name='loss') train_op = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss) correct_prediction = tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), tf.cast(tf.argmax(y, 1), tf.int32)) acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) saver = tf.train.Saver() # Tensorboard filewriter_path = 'tensorboard' tf.summary.scalar('loss', loss) tf.summary.scalar('accuracy', acc) merged_summary = tf.summary.merge_all() #merge_summary = tf.summary.merge([loss_summary, acc_summary]) writer = tf.summary.FileWriter(filewriter_path) #config = tf.ConfigProto() #config.gpu_options.allow_growth = True f=open("loss_acc.txt","a+") with tf.Session() as sess: sess.run(tf.global_variables_initializer()) #saver.restore(sess, MODEL_SAVE_PATH+MODEL_NAME) for i in range(60): # 20000 train_loss, train_acc, n_batch = 0, 0, 0 for train_data_batch, train_label_batch in ReadData.get_batch(train_data, train_label, BATCH_SIZE): _, err= sess.run([train_op, loss], feed_dict={x: train_data_batch, y: train_label_batch}) train_loss += err n_batch += 1 print(i," train loss: %f" % (np.sum(train_loss) / n_batch)) saver.save(sess, MODEL_SAVE_PATH+MODEL_NAME) test_acc= 0 test_acc = sess.run(acc, feed_dict={x: test_data, y: test_label}) print(i," test acc: %f" % test_acc) new_context = str(i)+" " +str((np.sum(train_loss) / n_batch))+" " +str(test_acc)+ '\n' f.write(new_context) result = sess.run(merged_summary, feed_dict={x: test_data, y: test_label}) writer.add_summary(result, i) writer.close() f.close()
def test(): appointMap = ReadData.getAppointMap() for user in appointMap: lastestTime = ReadData.get_user_lastest_time(user, appointMap) userDict = ReadData.get_recent_user_classId_Dict( lastestTime, appointMap) itemUser = getItemUserMap(userDict) print 'userID : ', user print 'his classID : ', userDict[user] print recommendByUserFC(userDict, itemUser, user, 8) print '=========================================='
def test(): appointMap = ReadData.getAppointMap() for userID in appointMap: lastestTime = ReadData.get_user_lastest_time(userID, appointMap) print userID, '=========================' history_classID, col_recom, apriori_recom = getCandidate( lastestTime, userID, appointMap) print 'history : ', history_classID print 'col_recom : ', col_recom print 'apriori_recom : ', apriori_recom print 'hour : ', getHourOfCandidate(lastestTime, userID, appointMap) print 'storeID : ', getStoreOfCandidate(lastestTime, userID, appointMap)
def lstm_raw():
    print('Loading data...')
    folder_path = r'H:\network_diagnosis_data\cut-1000'
    X_t, y_t, dicc = ReadData.ReadRaw2HierData(folder_path, N)
    nb_classes = np.max(y_t) + 1
    X_t = ReadData.to_num(X_t, max_features)
    X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.2, random_state=42)
    print('Padding sequences')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)  # padding = 'post'
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)  # truncating = 'post'
    y_train = to_categorical(y_train, nb_classes)
    y_test = to_categorical(y_test, nb_classes)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    print('Building model...')
    model = Sequential()
    model.add(Embedding(max_features, Embedding_Dim, dropout=0.2))
    model.add(LSTM(Embedding_Dim, dropout_W=0.2, dropout_U=0.2))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    # try using different optimizers and different optimizer configs
    model.compile(loss='categorical_crossentropy',  # binary_crossentropy
                  optimizer='adam',
                  metrics=['accuracy'])
    from keras.utils.visualize_util import plot
    plot(model, to_file=r'.\data\lstm-model.png')
    print('Training...')
    # model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=num_epoch,
    #           validation_data=(X_test, y_test))
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=num_epoch,
              validation_split=0.1, verbose=1)
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    data_today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    plot(model, to_file=r'.\data\lstm-model' + data_today + '.png')
    json_string = model.to_json()  # equivalent to: json_string = model.get_config()
    open(r'.\data\lstm-model' + data_today + '.json', 'w+').write(json_string)
    model.save_weights(r'.\data\keras-lstm' + data_today + '.h5', overwrite=True)
    print('model saved')
def run(self, name, datafiles, goldnet_file): import numpy os.chdir(os.environ["gene_path"]) datastore = ReadData(datafiles[0], "steadystate") for file in datafiles[1:]: datastore.combine(ReadData(file, "steadystate")) datastore.normalize() settings = {} settings = ReadConfig(settings) # TODO: CHANGE ME settings["global"]["working_dir"] = os.getcwd() + '/' # Setup job manager print "Starting new job manager" jobman = JobManager(settings) # Make GENIE3 jobs genie3 = GENIE3() genie3.setup(datastore, settings, name) print "Queuing job..." jobman.queueJob(genie3) print jobman.queue print "Running queue..." jobman.runQueue() jobman.waitToClear() print "Queue finished" job = jobman.finished[0] print job.alg.gene_list print job.alg.read_output(settings) jobnet = job.alg.network print "PREDICTED NETWORK:" print job.alg.network.network print jobnet.original_network if goldnet_file != None: goldnet = Network() goldnet.read_goldstd(goldnet_file) print "GOLD NETWORK:" print goldnet.network print jobnet.analyzeMotifs(goldnet).ToString() print jobnet.calculateAccuracy(goldnet) return jobnet.original_network
def load_train_test_data(self, train_data, test_data): """ :param train_data: train data(raw data) is file we wanted our algorithm to train with so we can use that result with test data. :param test_data: test data(raw data) for checking that our prediction is right or not and finding the accuracy. :return: well formed train and test data with having rows as one image and index is label of the image. """ try: # next line will give you transposed and well formatted train data. train_data = ReadData.load_data(train_data) # next line will give you transposed and well formatted test data. test_data = ReadData.load_test_data(test_data) return train_data, test_data except Exception as e: print(e)
def slide_and_cut(tmp_data, tmp_label, tmp_pid):
    out_pid = []
    out_data = []
    out_label = []
    window_size = 6000
    cnter = {'N': 0, 'O': 0, 'A': 0, '~': 0}
    for i in range(len(tmp_data)):
        # count total signal length available per class
        if tmp_label[i] in cnter:
            cnter[tmp_label[i]] += len(tmp_data[i])
    stride_N = 500
    stride_O = int(stride_N // (cnter['N'] / cnter['O']))
    stride_A = int(stride_N // (cnter['N'] / cnter['A']))
    stride_P = int(0.85 * stride_N // (cnter['N'] / cnter['~']))
    stride = {'N': stride_N, 'O': stride_O, 'A': stride_A, '~': stride_P}
    for i in range(len(tmp_data)):
        tmp_stride = stride[tmp_label[i]]
        tmp_ts = tmp_data[i]
        for j in range(0, len(tmp_ts) - window_size, tmp_stride):
            out_pid.append(tmp_pid[i])
            out_data.append(tmp_ts[j:j + window_size])
            out_label.append(tmp_label[i])
    out_label = ReadData.Label2OneHot(out_label)
    out_data = np.expand_dims(np.array(out_data, dtype=np.float32), axis=2)
    out_label = np.array(out_label, dtype=np.float32)
    out_pid = np.array(out_pid, dtype=np.string_)
    return out_data, out_label, out_pid
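# Worked example of the class-balancing strides in slide_and_cut above. The counts are
# illustrative assumptions, not values from the real dataset: rarer classes get a smaller
# stride, so their sliding windows overlap more and contribute proportionally more samples.
cnter_example = {'N': 3000000, 'O': 1500000, 'A': 400000, '~': 150000}
stride_N_ex = 500
stride_O_ex = int(stride_N_ex // (cnter_example['N'] / cnter_example['O']))         # 250
stride_A_ex = int(stride_N_ex // (cnter_example['N'] / cnter_example['A']))         # 66
stride_P_ex = int(0.85 * stride_N_ex // (cnter_example['N'] / cnter_example['~']))  # 21
print(stride_N_ex, stride_O_ex, stride_A_ex, stride_P_ex)  # 500 250 66 21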
def processResponses(rawData, headerID, questionTextHeader, filters=None, minAnsweredFrac=0.5, keepDisconnected=False, doRnd=False, weightByFreq=False): # read in and prepare raw data print("Reading and preparing raw data, minFrac = " + str(minAnsweredFrac)) data = rd.readQuestionAndResponseData(rawData, questionTextHeader, doRound=doRnd) data['questions']['questions'] = data['questions']['questions'].set_index( rd.headerQuestion) data['responses'] = data['responses'].set_index(headerID) if filters == None: filters = [responseFilterFlag] # response data contains filtered data used in subsequent analysis print("Building response data") responseData = buildResponseData(data, minAnsweredFrac, keepDisconnected, filters, headerID, True) if len(responseData['responseVectors']) == 0: return None #print(responseData['responseVectors']) # compute similarity matrix print("Computing similarity matrix") print("Building feature vectors") buildFeatureVectors(responseData, {}, [], weightByFreq=weightByFreq) #print(responseData['featureVectors']) print("Computing feature similarity") simData = computePairwiseFeatureSimilarities(responseData) return {'data': data, 'responseData': responseData, 'simData': simData}
def get_virus_host_graph(virus):
    # Specify virus as either 'HIV' or 'SARS-CoV-2'
    S2E = ReadData.load_S2E()
    VHG = nx.Graph()  # virus-host graph
    if virus == 'SARS-CoV-2':
        f = '%s/SARS-COV2/bait_prey_high_confidence.xlsx' % data_dir
        Gordon_table = pd.read_excel(f)
        n = Gordon_table.shape[0] - 1  # first line is header
        for i in range(n):
            Bait = list(Gordon_table.loc[i + 1])[0]  # skipping first line
            Prey = list(Gordon_table.loc[i + 1])[2]
            try:
                Prey = S2E[Prey]
            except KeyError:
                continue
            VHG.add_edge(Bait, Prey)
    if virus == 'HIV':
        f = '%s/HIV/NCBI/HIV-1_physical_interactions.tsv' % data_dir
        NCBI_table = pd.read_table(f, delimiter='\t')
        n = NCBI_table.shape[0] - 1  # first line is header
        for i in range(n):
            # include only entries with at least two reported interactions
            if list(NCBI_table.loc[i + 1])[3] >= 2:
                v = list(NCBI_table.loc[i + 1])[0]
                h = list(NCBI_table.loc[i + 1])[1]
                VHG.add_edge(v, str(h))
    return VHG
def run() : global data_path, problem_set, instance_num if request.method == 'POST': print(request.form, flush=True) return else: instance = request.args.get('instance') if instance == "" : return render_template("index.html",x = x,y = y,z = z) J = [] problem_set = instance[:1] instance_num = instance[1:] ReadData(os.path.join(APP_STATIC, data_path + problem_set + "_"+instance_num),J) ga = NSGA(500,3,Job_set = J,common_due_date=120) ga.run() pareto = [item for sublist in ga.nondominated_sort() for item in sublist] input = [[] for _ in range(2)] output = [[]] weight = [] for point in pareto : input[0].append(point.obj[0])#weighted tardiness input[1].append(point.obj[1])#total_flow_time output[0].append(point.obj[2])#pieces weight.append(point.weights) res = (DEA_analysis(input, output)) eff = [r['Efficiency'] for r in res] result["weight"] = weight result["Flow_time"] = input[1] result["Tardiness"] = input[0] result["Piece"] = output[0] result["DEA_score"] = eff return render_template("index.html",result = result)
def searchBestDamping():
    datasetMetrics = {}
    linfoma = ReadData.getDatasets()
    for df, param in linfoma:
        print("searching damping for dataset: ", param.name)
        metrics = []
        path = 'Experiments/SearchingDamping/Affinity_bin/' + param.name
        affinity_matrix = pd.read_csv(path, header=None)
        for damp in np.array(range(50, 96, 1)) / 100:
            if (damp >= 1):
                damp = 0.99
            if ((damp % 0.1) <= 0.01):
                print(str(int(damp * 200 - 100)), '%')
            if (affinity_matrix is None):
                affinity = AffinityPropagationModel(df)
                affinity.fit(damp, param.affinity['preference'])
                affinity_matrix = pd.DataFrame(affinity.model.affinity_matrix_)
                affinity_matrix.to_csv(path, index=False, header=False)
            else:
                affinity = AffinityPropagationModel(df)
                affinity.fit(damp, param.affinity['preference'], 'precomputed', affinity_matrix)
            metrics.append(affinity.getValidation())
        datasetMetrics[param.name] = (pd.DataFrame(
            metrics,
            columns=['pureza', 'entropia', 'rand index', 'silhueta', 'davies-bouldin', 'n_groups'],
            index=np.array(range(50, 96, 1)) / 100), param)
    return datasetMetrics
def apriori_classID(lastestTime, appointMap, minSupport=0.1, minConf=0.6): dataSet = ReadData.get_all_recent_class(lastestTime, appointMap) L, suppData = Apriori.apriori(dataSet, minSupport) rules = Apriori.generateRules(L, suppData, minConf) return rules
def read_data(): X = ReadData.read_centerwave('../../data1/centerwave_resampled.csv') _, _, Y = ReadData.ReadData('../../data1/QRSinfo.csv') all_feature = np.array(X) print(all_feature.shape) all_label = np.array(Y) all_label_num = np.array(ReadData.Label2OneHot(Y)) kf = StratifiedKFold(n_splits=5, shuffle=True) i_fold = 1 print('all feature shape: {0}'.format(all_feature.shape)) for train_index, test_index in kf.split(all_feature, all_label): train_data = all_feature[train_index] train_label = all_label_num[train_index] test_data = all_feature[test_index] test_label = all_label_num[test_index] print('read data done') return all_feature, all_label_num, train_data, train_label, test_data, test_label
def expand_three_part(): long_pid, long_data, long_label = ReadData.ReadData('../../data1/long.csv') kf = StratifiedKFold(n_splits=5, shuffle=True) for train_index, other_index in kf.split(np.array(long_data), np.array(long_label)): train_data = np.array(long_data)[train_index] train_label = np.array(long_label)[train_index] train_pid = np.array(long_pid)[train_index] other_data = np.array(long_data)[other_index] other_label = np.array(long_label)[other_index] other_pid = np.array(long_pid)[other_index] kf_1 = StratifiedKFold(n_splits=2, shuffle=True) for val_index, test_index in kf_1.split(np.array(other_data), np.array(other_label)): val_data = np.array(other_data)[val_index] val_label = np.array(other_label)[val_index] val_pid = np.array(other_pid)[val_index] test_data = np.array(other_data)[test_index] test_label = np.array(other_label)[test_index] test_pid = np.array(other_pid)[test_index] break break train_data_out, train_label_out, train_data_pid_out = slide_and_cut( list(train_data), list(train_label), list(train_pid)) val_data_out, val_label_out, val_data_pid_out = slide_and_cut( list(val_data), list(val_label), list(val_pid)) test_data_out, test_label_out, test_data_pid_out = slide_and_cut( list(test_data), list(test_label), list(test_pid)) print( len(set(list(train_pid)) & set(list(val_pid)) & set(list(test_pid))) == 0) # with open('../../data1/expanded_three_part_window_6000_stride_500_6.pkl', 'wb') as fout: # pickle.dump(train_data_out, fout) # pickle.dump(train_label_out, fout) # pickle.dump(val_data_out, fout) # pickle.dump(val_label_out, fout) # pickle.dump(test_data_out, fout) # pickle.dump(test_label_out, fout) # pickle.dump(test_data_pid_out, fout) ### use np.save to save larger than 4 GB data fout = open('../../data1/expanded_three_part_window_6000_stride_299.bin', 'wb') np.save(fout, train_data_out) np.save(fout, train_label_out) np.save(fout, val_data_out) np.save(fout, val_label_out) np.save(fout, test_data_out) np.save(fout, test_label_out) np.save(fout, test_data_pid_out) fout.close() print('save done')
def build_three_layer_network(drug_source, virus, target_type, HI=None, VHG=None): S2E = ReadData.load_S2E() if HI == None: HI = ReadData.load_HI() multi_graph = HI.copy() if VHG == None: VHG = get_virus_host_graph(virus) for edge in VHG.edges(): multi_graph.add_edge(edge[0], edge[1]) D2T = ReadData.load_drugbank(target_type=target_type) for drug, targets in D2T.items(): for target in targets: multi_graph.add_edge(drug, str(target)) return (multi_graph)
def expand_all(): long_pid, long_data, long_label = ReadData.ReadData('../../data1/long.csv') data_out, label_out, pid_out = slide_and_cut(long_data, long_label, long_pid) ### use np.save to save larger than 4 GB data fout = open('../../data1/expanded_all_window_6000_stride_500.bin', 'wb') np.save(fout, data_out) np.save(fout, label_out) fout.close() print('save done')
def read_data(): long_pid, long_data, long_label = ReadData.ReadData( '../../data1/centerwave.csv' ) mat1 = [truncate_long(ts, 9000) for ts in long_data] mat2 = [truncate_long(ts, 6000) for ts in long_data] mat3 = [truncate_long(ts, 3000) for ts in long_data] mat4 = [sample_long(ts, 10) for ts in mat1] mat5 = [sample_long(ts, 10) for ts in mat2] mat6 = [sample_long(ts, 10) for ts in mat3] label_onehot = ReadData.Label2OneHot(long_label) # plt.plot(mat1[0]) # plt.plot(mat4[0]) mat1 = np.expand_dims(np.array(mat1), axis=2) label_onehot = np.array(label_onehot) return mat1, label_onehot
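# truncate_long and sample_long are used above but are not shown in this file. A minimal
# sketch of what they might look like (pad/truncate to a fixed length, then keep every
# k-th sample) -- these are assumptions, not the original helpers:
def truncate_long(ts, length):
    ts = list(ts)[:length]
    return ts + [0.0] * (length - len(ts))  # zero-pad series shorter than `length`

def sample_long(ts, step):
    return list(ts)[::step]  # keep every `step`-th point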
def findOutlier(): train_data, test_data = rd.readData() plt.figure(figsize=(18, 10)) plt.boxplot(x=train_data.values, labels=train_data.columns) plt.hlines([-7.5, 7.5], 0, 40, colors='r') plt.show() train_data = train_data[train_data['V9'] > -7.5] test_data = test_data[test_data['V9'] > -7.5] # train_data.describe() # test_data.describe() return train_data, test_data
def createNormalizedDatasets(): list_data = ReadData.getDatasets() datasets = [] for data, param in list_data[1:2]: print(data) #Scaling the samples to have unit norm normalization = ['l1', 'l2'] for norm in normalization: data_normalized = preprocessing.normalize(data, norm=norm, axis=0, copy=True) name = 'data_normalized_' + norm data_normalized = pd.DataFrame(data_normalized, columns=data.columns) parameters = Parameters(name, 'Data/' + name + '.csv', param.k_clusters, param.eps, param.damping) datasets.append((data_normalized, parameters)) #Mapping data to a defined distribution by quantile transforms distribution = ['uniform', 'normal'] for dist in distribution: data_normalized = preprocessing.quantile_transform( data, axis=0, output_distribution=dist, random_state=Seed, copy=True) name = 'data_distribution_' + dist data_normalized = pd.DataFrame(data_normalized, columns=data.columns) parameters = Parameters(name, 'Data/' + name + '.csv', param.k_clusters, param.eps, param.damping) datasets.append((data_normalized, parameters)) ''' #Mapping data to a defined distribution by power transforms standardize = [True,False] for stand in standardize: data_normalized = preprocessing.power_transform(data, method = 'yeo-johnson', standardize = stand, copy=True) name = 'data_distribution_' + dist data_normalized = pd.DataFrame(data_normalized, columns = data.columns) parameters = Parameters(name, 'Data/'+name+'.csv', param.k_clusters, param.eps, param.damping) datasets.append( (data_normalized, parameters) ) ''' return datasets
def main():
    import ReadData
    filename = r'train_data.txt'
    print '.........reading data...........'
    UsersItems, Items = ReadData.ReadData(filename)
    print '.........splitting data...........'
    train, test = ReadData.divideData(UsersItems)
    print '.............training recommender............'
    flag = 0  # 0: train/test run, 1: generate final results
    near_num = 200
    top_num = 1
    hiddenStates_num = 10
    max_iter = 30
    mPLSA = CpLSA()
    if flag == 0:
        mPLSA.transformData(train)
        mPLSA.process(hiddenStates_num, max_iter)
        simUsers = mPLSA.calSimUsers(near_num)
    elif flag == 1:
        mPLSA.transformData(UsersItems)
        mPLSA.process(hiddenStates_num, max_iter)
def main(tweetList): tweetListPreprocessed = [] tweetPreprocessed = "" for tweet in tweetList: if (tweet.text != 'Not Available'): tweetPreprocessed = preprocessText(tweet.text) # Save in new object tweetPre = read.make_tweet(tweet.id, tweet.name, tweet.language, tweetPreprocessed) tweetListPreprocessed.append(tweetPre) return tweetListPreprocessed
def pair2rank(self, pair_pref_prob_samp):
    # pair_pref_prob_samp: list of shape Nclass x Nclass
    # Borda count: sum each class's pairwise preferences, then rank by score
    ppps = pair_pref_prob_samp
    Nclass = len(ppps)
    for i in range(Nclass):
        if ppps[i][i] != 0:
            print "warning: self comparison", i, ppps[i][i]
            ppps[i][i] = 0
    rscore = map(sum, ppps)
    # print "rscore", rscore
    rank = ReadData.rankOrder(rscore)
    return rank
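# Hedged sketch of the Borda-count ranking used in pair2rank above, assuming
# ReadData.rankOrder(scores) orders classes by descending score (that helper is not
# shown in this file, so its exact behavior here is an assumption):
import numpy as np

def rank_order(scores):
    # index of the highest-scoring class first
    return list(np.argsort(scores)[::-1])

ppps_example = [[0.0, 0.7, 0.9],
                [0.3, 0.0, 0.6],
                [0.1, 0.4, 0.0]]
rscore_example = [sum(row) for row in ppps_example]  # Borda score per class
print(rank_order(rscore_example))  # [0, 1, 2]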
def initializeData(): global CLASS_WORDS global CLASSES global CLASS_DICT global TRAINING_DATA_STATS global training_data global test_data ReadData.shuffleTrainingData() training_data = ReadData.TRAINING_DATA[:800] test_data = ReadData.TRAINING_DATA[800:] for i in range(len(CLASSES)): CLASS_DICT[CLASSES[i]] = Class(CLASSES[i]) for data in training_data: # Tokenize each sentence into words sentence = removeSpecialCharacters(data['sentence']) sentence = nltk.word_tokenize(sentence) sentence = lemmatizeSentence(sentence) doc_num = CLASS_DICT[data['class']].getTotalDocuments() document = Document(doc_num, data['class'], sentence) CLASS_DICT[data['class']].addDocument(document) TRAINING_DATA_STATS.training_docs.append(sentence) for word in sentence: # Have we not seen this word already? CLASS_DICT[data['class']].addToTotalClassWordFreq(word) # Add the word to our words in class list CLASS_DICT[data['class']].documents[doc_num].addToDocsWordFreq( word) # This is frequency so we need to change this part CLASS_DICT[data['class']].addWords([word]) TRAINING_DATA_STATS.addToTotalWordFreq(word) TRAINING_DATA_STATS.addWords([word])
def run():
    # run a shell command
    cmd = "ls"
    subprocess.call(cmd, shell=True)
    # read and aggregate the raw data, write the results to CSV
    rawdatafiles = ReadData.rawfilelisting("../data/updates/YoutubePakistan/")
    #ReadData.readupdatedata(rawdatafiles)
    # sanity checks on the data
    # ReadData.checklink()
    # ReadData.checkupdate()
    # plot the CSV output
    timerange = 60
    number_of_files = int(timerange / 15)
    DrawGraph.draw(number_of_files)  # read four 15-minute files and draw the graph
# # This will generate a file with the linear coefficients from 1 to 5 grams. # # # # _____________________________________________________________________________ # 1-. Read dataset and create tweetList fullfilled of Tweet object* dataset = sys.argv[1] maxNgram = int(sys.argv[2]) filename = os.path.basename(dataset).split('.') tweetList = read.read_tweets_dataset(dataset) # 2-. Pre-process state # Raw data -> tweetList # Clean data -> tweetListPreProcessed tweetListPreProcessed = preprocess.main(tweetList) # 3-. OBTAIN N-GRAMS and Linear Coefficients for i in xrange(5, maxNgram+1): corpusNgrams, arrayLanguages,arrayLanguagesFull = utils.obtainNgrams(tweetListPreProcessed, i+1) linearCoefficients = linear.getlinearcoefficientsForLanguageArray(arrayLanguages, i, corpusNgrams) # print linearCoefficients file = open('../Dataset/LICoefficients_'+str(maxNgram)+'gram_for-'+str(filename[0])+'.txt', 'a+') for li in linearCoefficients: file.write(str(i)+"\t"+str(li[0]))
# write data a line per sentence, to temporary file temp = tempfile.NamedTemporaryFile() for obs in data: temp.write(obs + "\n") temp.flush() return temp # Wikipedia data is already in a sentence per line format with open(Globals.WIKI_POS, "w") as pos_file: tag(stanford_cmd, Globals.STANFORD_PATH, Globals.WIKI_TRAIN, pos_file) # Twitter, we will read in data, and write to temporary file # Training data twitter_train_raw = ReadData.readTwitterData(Globals.TWITTER_TRAIN, splitwords=False) twitter_temp_file = write_temp(twitter_train_raw[1]) with open(Globals.TWITTER_TRAIN_POS, 'w') as pos_file: tag(gate_cmd, Globals.GATE_PATH, twitter_temp_file.name, pos_file) twitter_temp_file.close() # Test data twitter_test_raw = ReadData.readTwitterData(Globals.TWITTER_TEST, splitwords= False) twitter_temp_file = write_temp(twitter_test_raw[1]) with open(Globals.TWITTER_TEST_POS, 'w') as pos_file: tag(gate_cmd, Globals.GATE_PATH, twitter_temp_file.name, pos_file) twitter_temp_file.close() # Blog data blog = ReadData.readBlogData(Globals.BLOG_DATA, splitwords = False) blog = [ txt for txt,label in blog ]
def run(self, kofile, tsfile, wtfile, datafiles, name, goldnet_file, normalize=False): os.chdir(os.environ["gene_path"]) knockout_storage = ReadData(kofile, "knockout") print "Reading in knockout data" wildtype_storage = ReadData(wtfile, "steadystate") if datafiles == []: other_storage = None else: other_storage = ReadData(datafiles[0], "steadystate") for file in datafiles[1:]: other_storage.combine(ReadData(file, "steadystate")) timeseries_storage = None if tsfile != None: timeseries_storage = ReadData(tsfile, "timeseries") #for ts in timeseries_storage: #ts.normalize() #if normalize: #knockout_storage.normalize() #wildtype_storage.normalize() #other_storage.normalize() settings = {} settings = ReadConfig(settings) # TODO: CHANGE ME settings["global"]["working_dir"] = os.getcwd() + '/' # Setup job manager print "Starting new job manager" jobman = JobManager(settings) # Make inferelator jobs inferelatorjob = inferelator() inferelatorjob.setup(knockout_storage, wildtype_storage, settings, timeseries_storage, other_storage, name) print "Queuing job..." jobman.queueJob(inferelatorjob) print jobman.queue print "Running queue..." jobman.runQueue() jobman.waitToClear() print "Queue finished" job = jobman.finished[0] #print job.alg.gene_list #print job.alg.read_output(settings) jobnet = job.alg.network #print "PREDICTED NETWORK:" #print job.alg.network.network print jobnet.original_network if goldnet_file != None: goldnet = Network() goldnet.read_goldstd(goldnet_file) #print "GOLD NETWORK:" #print goldnet.network #print jobnet.analyzeMotifs(goldnet).ToString() print jobnet.calculateAccuracy(goldnet) import AnalyzeResults tprs, fprs, rocs = AnalyzeResults.GenerateMultiROC(jobman.finished, goldnet ) ps, rs, precs = AnalyzeResults.GenerateMultiPR(jobman.finished, goldnet) print "Area Under ROC" print rocs print "Area Under PR" print precs return jobnet.original_network
settings["global"]["experiment_name"] + "-" + t + "/" os.mkdir(settings["global"]["output_dir"]) # Read in configs for this algorithm from dfg4grn import * settings = ReadConfig(settings, "./config/default_values/dfg4grn.cfg") settings = ReadConfig(settings, settings["dfg4grn"]["config"]) data = {} knockouts = {} wildtypes = {} knockdowns = {} multifactorials = {} timeseries_as_steady_state = {} # Loop over the directories we want, reading in the timeseries files data = ReadData(data_file, "kranthi_data") ss_names = open(ss_names).read().splitlines() ts_names = open(ts_names).read().splitlines() pert_names = open(pert_names).read().splitlines() # Filter out the genes we don't want genes_of_interest = open(genes_file).read().splitlines() genes_of_interest = [x.upper() for x in genes_of_interest] data.filter(genes_of_interest) # Read in the legend so we know what the experiment names relate to col_to_exp = {} pert_cond = {} for line in open(legend): line = line.strip() line = line.split(',')
if sys.argv[1] == "dream4100": goldnet.read_goldstd(settings["global"]["dream4100_network_goldnet_file"]) #Get a list of the knockout files ko_file = settings["global"]["dream4100_network_knockout_file"].split() kd_file = settings["global"]["dream4100_network_knockdown_file"].split() ts_file = settings["global"]["dream4100_network_timeseries_file"].split() wt_file = settings["global"]["dream4100_network_wildtype_file"].split() mf_file = settings["global"]["dream4100_network_multifactorial_file"].split() # Read data into program # Where the format is "FILENAME" "DATATYPE" knockout_storage = ReadData(ko_file[0], "knockout") knockdown_storage = ReadData(kd_file[0], "knockdown") timeseries_storage = ReadData(ts_file[0], "timeseries") wildtype_storage = ReadData(wt_file[0], "wildtype") mf_storage = ReadData(mf_file[0], "multifactorial") # Setup job manager jobman = JobManager(settings) # Make BANJO jobs mczjob = MCZ() mczjob.setup(knockout_storage, wildtype_storage, settings, timeseries_storage, knockdown_storage, "MCZ_Alone") jobman.queueJob(mczjob) clrjob = CLR() clrjob.setup(knockout_storage, settings, "clr_" + t + "_Bins-" + str(6), "plos", 6) jobman.queueJob(clrjob) #cojob = ConvexOptimization() #cojob.setup(knockout_storage, settings, "ConvOpt_T-Plos",None, None, 0.04)
# Usage: python selectionSort.py input # Chen-Yu Li [email protected] # 2014/3/5 ## Standard module import sys, os import numpy as np import re ## User's own module sys.path.append('/home/cli56/scripts') import ReadData ## Read input print "Start loading data..." array=ReadData.loadAscii(sys.argv[1]) print "Finished loading data!" ## set output file name inputPrefix = re.split('\.', sys.argv[1]) inputPrefix_noType = '' for i in range(len(inputPrefix)-1): inputPrefix_noType = inputPrefix_noType + inputPrefix[i] if i < (len(inputPrefix)-2): inputPrefix_noType = inputPrefix_noType + '.' s="_sorted.dat" outputPrefix = inputPrefix_noType+s
# Create date string to append to output_dir t = datetime.now().strftime("%Y-%m-%d_%H.%M.%S") settings["global"]["output_dir"] = settings["global"]["output_dir"] + "/" + \ settings["global"]["experiment_name"] + "-" + t + "/" os.mkdir(settings["global"]["output_dir"]) # Read in the gold standard network # Read in the gold standard network #goldnet.read_goldstd(settings["global"]["large_network_goldnet_file"]) #ko_file, kd_file, ts_file, wt_file, mf_file, goldnet = get_example_data_files(sys.argv[1], settings) # Read data into program # Where the format is "FILENAME" "DATATYPE" dex_storage = ReadData("datasets/RootArrayData/DexRatios.csv", "dex") dexcombined = ReadData("datasets/RootArrayData/DexRatios.csv", "dex") dex_storage2 = ReadData("datasets/RootArrayData/HHO3_DEX_ratios.csv", "dex") cnlo_storage = ReadData("datasets/RootArrayData/Root_CNLO_Krouk.txt", "dex") cnlo_no3_storage = ReadData("datasets/RootArrayData/Root_CNLO_Krouk.txt", "dex") no3_1_storage = ReadData("datasets/RootArrayData/Root_NO3_Wang03.txt", "dex") no3_2_storage = ReadData("datasets/RootArrayData/Root_NO3_Wang04.txt", "dex") no3_3_storage = ReadData("datasets/RootArrayData/Root_NO3_Wang07.txt", "dex") #ts_storage = ReadData("datasets/RootArrayData/Root_WT_Krouk11.txt", "dex") tfs_file = open("datasets/RootArrayData/tfs.csv", 'r') line = tfs_file.readlines()[0] tfs = line.strip().split(',') tfs = [x.upper() for x in tfs]
import numpy as np
# import scipy as sp
from scipy import interpolate
import matplotlib.pyplot as plt

import ReadData
import libMATERIALS

Ge_table, Ge_start, dU_Ge, N_Ge1, N_Ge2 = libMATERIALS.initialize_ge()
TGe = interpolate.interp1d(Ge_table[2, :], Ge_table[0, :], "slinear")
Tx = np.zeros((100))
Ex = np.zeros((100))
ct2 = 0
for iter in range(10000, 173000, 1000):
    ct, L, dL, Ne, qfluxL, qfluxC, qfluxR = ReadData.readdata1(iter)
    qL, qC, qR, Tavg, Eavg = ReadData.readdata2(iter, Ne[0], Ne[1], Ne[2])
    ct2 = ct2 + 1
    for i in range(Ne[0]):
        Ex[i] = Ex[i] + np.sum(Eavg[i, :, :]) / (Ne[1] * Ne[2])
Ex = Ex / ct2 / (dL[0] * dL[1] * dL[2])
Tx = TGe(Ex)
x = np.linspace(0.5 * dL[0], L[0] - 0.5 * dL[0], Ne[0])
plt.figure()
plt.plot(x, Tx)
''' Created on Jun 8, 2012 @author: yyb ''' 'Write the information into GraphML format files' import os; import ReadData; ReadData.readSubDate(); ReadData.readLinks(); myDayDict = ReadData.dailyDict; myMonDict = ReadData.monthlyDict; myYrDict = ReadData.yearlyDict; myPaperDict = ReadData.paperDict; myCiteDict = ReadData.citeDict; totalDay = len(ReadData.dailyList); totalMon = len(ReadData.monthlyList); totalYr = len(ReadData.yearlyList); #=============================================================================== # Write information into GraphML format files with a daily unit #=============================================================================== def writeDailyGML(size):
wt_file = settings["global"]["medium_network_wildtype_file"].split() # Read in the gold standard network #goldnet = Network() #goldnet.read_goldstd(settings["global"]["large_network_goldnet_file"]) #ko_file = settings["global"]["large_network_knockout_file"].split() #kd_file = settings["global"]["large_network_knockdown_file"].split() #ts_file = settings["global"]["large_network_timeseries_file"].split() #wt_file = settings["global"]["large_network_wildtype_file"].split() # Read data into program # Where the format is "FILENAME" "DATATYPE" knockout_storage = ReadData(ko_file[0], "knockout") knockdown_storage = ReadData(kd_file[0], "knockdown") timeseries_storage = ReadData(ts_file[0], "timeseries") wildtype_storage = ReadData(wt_file[0], "wildtype") wildtype_storage.combine(knockout_storage) wildtype_storage.combine(knockdown_storage) wildtype_storage.combine(timeseries_storage) wildtype_storage.normalize() knockout_storage.normalize() # Setup job manager jobman = JobManager(settings) # Make BANJO jobs
def run(self, datafiles=None, name=None, goldnet_file=None, topd=None, restk=None): import numpy os.chdir(os.environ["gene_path"]) print "Reading in data" data_storage = ReadData(datafiles[0], "steadystate") for file in datafiles[1:]: data_storage.combine(ReadData(file, "steadystate")) settings = {} settings = ReadConfig(settings) # TODO: CHANGE ME settings["global"]["working_dir"] = os.getcwd() + "/" # Setup job manager print "Starting new job manager" jobman = JobManager(settings) # Make nir jobs nirjob = NIR() nirjob.setup(data_storage, settings, name, topd, restk) print "Queuing job..." jobman.queueJob(nirjob) print jobman.queue print "Running queue..." jobman.runQueue() jobman.waitToClear() print "Queue finished" job = jobman.finished[0] print job.alg.gene_list print job.alg.read_output(settings) jobnet = job.alg.network print "PREDICTED NETWORK:" print job.alg.network.network if goldnet_file != None: goldnet = Network() goldnet.read_goldstd(goldnet_file) # print "GOLD NETWORK:" # print goldnet.network # print jobnet.analyzeMotifs(goldnet).ToString() print jobnet.calculateAccuracy(goldnet) import AnalyzeResults tprs, fprs, rocs = AnalyzeResults.GenerateMultiROC( jobman.finished, goldnet, True, job.alg.output_dir + "/ROC.pdf" ) ps, rs, precs = AnalyzeResults.GenerateMultiPR( jobman.finished, goldnet, True, job.alg.output_dir + "/PR.pdf" ) print "Area Under ROC" print rocs print "Area Under PR" print precs return job.alg.network.network
import UtilsTweetSafa as utils import Smoothing as linear import numpy as np import CrossValidation as cv import sys maxNgram = 5 # 1-. Read dataset and create tweetList fullfilled of Tweet object* dataset = "../Dataset/output_complete.txt" test = "../Dataset/mezclado.txt" LI_Coefficients = "../Dataset/LICoefficients_5gram_for-output_complete.txt" tweetList = read.read_tweets_dataset(dataset) tweetListtest = read.read_tweets_dataset(test) # 2-. Pre-process state tweetListPreProcessed = preprocess.main(tweetList) tweetListPreProcessedtest= preprocess.main(tweetListtest) shuffle(tweetListPreProcessed) # Raw data -> tweetList # Clean data -> tweetListPreProcessed #utils.printTweets(tweetListPreProcessed) # 3-. Algorithms # # 3.1-. OBTAIN N-GRAMS
# goldnet.read_goldstd(settings["global"]["large_network_goldnet_file"]) settings["global"]["experiment_name"] = "GENIE3" + sys.argv[1] ko_file, kd_file, ts_file, wt_file, mf_file, goldnet = get_example_data_files( sys.argv[1], settings ) # Create date string to append to output_dir t = datetime.now().strftime("%Y-%m-%d_%H.%M.%S") settings["global"]["output_dir"] = ( settings["global"]["output_dir"] + "/" + settings["global"]["experiment_name"] + "-" + t + "/" ) os.mkdir(settings["global"]["output_dir"]) # Get a list of the multifactorial files # Read data into program # Where the format is "FILENAME" "DATATYPE" mf_storage = ReadData(mf_file[0], "multifactorial") ko_storage = ReadData(ko_file[0], "knockout") kd_storage = ReadData(kd_file[0], "knockdown") wt_storage = ReadData(wt_file[0], "wildtype") # Setup job manager jobman = JobManager(settings) # Make GENIE3 jobs genie3job = GENIE3() genie3job.setup(mf_storage, settings, "MF") jobman.queueJob(genie3job) mf_storage.combine(ko_storage) genie3job = GENIE3() genie3job.setup(mf_storage, settings, "MF_KO")
import ReadData
import numpy
import pylab
from math import *

myArray = ReadData.loadAscii("DataSet_PythonStatisticalAnalysis.txt")
print "I have read in myArray and the first element is ", myArray[0]


def Mean(myArray):
    for i in myArray:
        print i  # this prints everything in the array
    # let's change this to average the array with a single numpy call instead
    average = numpy.average(myArray)
    return average


def StandardDev(myArray):
    # compute the standard deviation
    standard_dev = numpy.std(myArray)
    return standard_dev


def NaiveStandardError(myArray):
    # calculate the standard error by calling the standard deviation function
    Neff = len(myArray)
    sig = StandardDev(myArray)
    se = sig / (numpy.sqrt(Neff - 1))
    return se
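# The "naive" standard error above assumes uncorrelated samples. A hedged sketch of a
# block-averaged alternative for correlated data (added for illustration, not part of
# the original module):
def BlockedStandardError(myArray, block_size=10):
    n_blocks = len(myArray) // block_size
    if n_blocks < 2:
        return NaiveStandardError(myArray)
    trimmed = numpy.asarray(myArray[:n_blocks * block_size], dtype=float)
    block_means = trimmed.reshape(n_blocks, block_size).mean(axis=1)
    return numpy.std(block_means) / numpy.sqrt(n_blocks - 1)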
""" # wrap application to handle tuple and non-tuple satisfies = lambda f, e: f(e[0]) if isinstance(data[0], tuple) else lambda f, e: f(e) defined_filters = set(filters.keys()) applied = defined_filters.intersection(filter_names) if filter_names != None else defined_filters if not applied: raise ValueError("Must apply at least one filter. try filter_names = None") if filter_names != None and len(applied) != len(filter_names): missing = applied.difference(filter_names) raise ValueError("Unspecified filter names: %s" % ", ".join(missing)) # ok down to business now clean_data = enumerate(data) for filter_name in applied: print "Applying filter: %s" % filter_name filter = filters[filter_name] clean_data = [ (i, obs) for i, obs in clean_data if satisfies(filter, obs) ] if return_indices: return [i for i, _ in clean_data] else: return [obs for _, obs in clean_data] # an example if __name__ == "__main__": import ReadData import Globals wikidata = ReadData.prepareWikiData(Globals.WIKI_TRAIN, splitwords= False) cleandata = clean_wiki(wikidata) print "Reduced data by %f" % (len(cleandata) / float(len(wikidata)))
for gene1 in goldnet.network: for gene2 in goldnet.network[gene1]: if goldnet.network[gene1][gene2] > 0: t.append(gene1) tfs[name] = list(set(t)) goldnet = Network() goldnet.read_goldstd(goldnets[data.keys()[0]]) genie3nets = {} for name in data.keys(): for i in range(50): ts_storage = data[name] settings["global"]["time_series_delta_t"] = (1008.0 / (len(ts_storage[0].experiments)-1)) combined = ReadData(exp_data_directory + '/' + name + '/' + timeseries_filename, "timeseries")[0] for ts in timeseries_as_steady_state[name][1:]: combined.combine(ts) combined.combine(multifactorials[name]) genie3job = GENIE3() genie3job.setup(combined, settings, "Genie3_TimeSeries_SS_{0}-{1}".format(name, i)) jobman.queueJob(genie3job) genie3nets[name] = genie3job jobman.runQueue() jobman.waitToClear() for name in data.keys(): for i in range(50):
__author__ = 'saghar hosseini ([email protected])' import numpy as np from ReadData import* from projection import* ########################################################################################## # Load Data ########################################################################################## # path="C:/Users/sagha_000/Documents/SVN/My_SVN/TimeVaryingSocialNetworks/datasets/as-733/" path="F:/Saghar_SVN/TimeVaryingSocialNetworks/datasets/twitter-pol-dataset/graphs/" dataset=ReadData(path) edges=dataset.read_network_snapshot(1,hasHeader=True) nodes_list=set(edges.keys()) output_path='F:/Saghar_SVN/TimeVaryingSocialNetworks/datasets/twitter-pol-dataset/Results/wo_OPD/' ############################################################################################ # Define Parameters ############################################################################################ numberOfSnapshots=1175 numCommunity=10 mu=0.1 lambdah_C=0.0 lambdah_B=0.0 sampleFraction=0.25 n=len(nodes_list) K_B=1.0 K_C=1.0 ############################################################################################# #variables learning_rate_C={} initial_state=dict() state=dict() visit={}
settings["global"]["experiment_name"] = "GenBio-German-Last-Removed-" + sys.argv[1] settings["global"]["n_processors"] = 1 # Set up output directory t = datetime.now().strftime("%Y-%m-%d_%H.%M.%S") settings["global"]["output_dir"] = settings["global"]["output_dir"] + "/" + \ settings["global"]["experiment_name"] + "-" + t + "/" os.mkdir(settings["global"]["output_dir"]) jobman = JobManager(settings) # Read data into program # Where the format is "FILENAME" "DATATYPE" c4d = ReadData("datasets/German_Data/Caldana-4d.tsv", "dex") c4l = ReadData("datasets/German_Data/Caldana-4L.tsv", "dex") c21d = ReadData("datasets/German_Data/Caldana-21d.tsv", "dex") c21hl = ReadData("datasets/German_Data/Caldana-21HL.tsv", "dex") c21l = ReadData("datasets/German_Data/Caldana-21L.tsv", "dex") c21ll = ReadData("datasets/German_Data/Caldana-21LL.tsv", "dex") c32l = ReadData("datasets/German_Data/Caldana-32L.tsv", "dex") c32l2 = ReadData("datasets/German_Data/Caldana-32L2.tsv", "dex") combined = ReadData("datasets/German_Data/Caldana-4d.tsv", "dex") c21l.experiments = c21l.experiments[1:] #settings["global"]["time_series_delta_t"] = [5,10,20,40,60,80,100,120,140,160,180,200,220,240,260,280,300,320,340,360,640,1280] settings["global"]["time_series_delta_t"] = [5,10,20,40,60,80,100,120,140,160,180,200,220,240,260,280,300,320,340,360,640,1280] #settings["global"]["time_series_delta_t"] = settings["global"]["time_series_delta_t"][:-remove]
firstLine = True j = 0 for line in inputFile[i]: if firstLine: # First line label inputFileLabel[i] = str(line) firstLine = False else: # For every beta point inputLine = str(line) fileExtension = inputLine.replace(' ','-') fileExtension = fileExtension.replace('\n','') filePath = "data/traces/" + varLabel + "Trace-" + fileExtension + ".dat" print "\nReading data from " + filePath + "." (myArray, myArrayHeadings) = ReadData.loadAscii(filePath) yValData[i].append(myArray) xValData[i].append(myArrayHeadings) # Make histograms and generate error bars print "\nSorting Data..." xVals[i].append([]) for k in range(0, len(xValData[i][j])): # For every x value xVals[i][j].append([]) xVals[i][j][k] = float(xValData[i][j][k]) bins[i].append([]) for l in range(0, len(xVals[i][j])): # For every bin in the histogram
def run(self, ko_file, wt_file, ts_file=None, kd_file=None, name=None): import numpy os.chdir(os.environ["gene_path"]) print "Reading in knockout data" knockout_storage = ReadData(ko_file, "knockout") knockout_storage.normalize() wildtype_storage = ReadData(wt_file, "wildtype") wildtype_storage.normalize() knockdown_storage = ReadData(kd_file, "knockdown") knockdown_storage.normalize() wildtype_storage.combine(knockdown_storage) timeseries_storage = None if ts_file != None: timeseries_storage = ReadData(ts_file, "timeseries") for ts in timeseries_storage: ts.normalize() settings = {} settings = ReadConfig(settings) # TODO: CHANGE ME settings["global"]["working_dir"] = os.getcwd() + '/' # Setup job manager print "Starting new job manager" jobman = JobManager(settings) # Make MCZ jobs mczjob = MCZ() mczjob.setup(knockout_storage, wildtype_storage, settings, timeseries_storage, name) print "Queuing job..." jobman.queueJob(mczjob) print jobman.queue print "Running queue..." jobman.runQueue() jobman.waitToClear() print "Queue finished" job = jobman.finished[0] print job.alg.gene_list print job.alg.read_output(settings) jobnet = job.alg.network print "PREDICTED NETWORK:" print job.alg.network.network print jobnet.original_network return jobnet.original_network
def get_network_results(name, settings, cache): print "STARTING", name if name in cache.keys(): print "CACHE HIT" return cache[name] ko_file, kd_file, ts_file, wt_file, mf_file, goldnet = get_example_data_files(name, settings) # Create date string to append to output_dir t = datetime.now().strftime("%Y-%m-%d_%H.%M.%S") settings["global"]["output_dir"] = settings["global"]["output_dir_save"] + "/" + \ settings["global"]["experiment_name"] + "-" + t + "-" + name + "/" os.mkdir(settings["global"]["output_dir"]) # Get a list of the multifactorial files # Read data into program # Where the format is "FILENAME" "DATATYPE" mf_storage = ReadData(mf_file[0], "multifactorial") knockout_storage = ReadData(ko_file[0], "knockout") knockdown_storage = ReadData(kd_file[0], "knockdown") wildtype_storage = ReadData(wt_file[0], "wildtype") timeseries_storage = ReadData(ts_file[0], "timeseries") gene_list = knockout_storage.gene_list # Setup job manager jobman = JobManager(settings) # MCZ mczjob = MCZ() mczjob.setup(knockout_storage, wildtype_storage, settings, timeseries_storage, knockdown_storage, "MCZ") jobman.queueJob(mczjob) # CLR clrjob = CLR() clrjob.setup(knockout_storage, settings, "CLR", "plos", 6) jobman.queueJob(clrjob) # GENIE3 mf_storage.combine(knockout_storage) mf_storage.combine(wildtype_storage) mf_storage.combine(knockdown_storage) genie3job = GENIE3() genie3job.setup(mf_storage, settings, "GENIE3") jobman.queueJob(genie3job) ## TLCLR tlclrjob = TLCLR() tlclrjob.setup(knockout_storage, wildtype_storage, settings, timeseries_storage, knockdown_storage, "TLCLR") jobman.queueJob(tlclrjob) #if sys.argv[1] != "dream4100": #cojob = ConvexOptimization() #cojob.setup(knockout_storage, settings, "ConvOpt_T-"+ str(0.01),None, None, 0.01) #jobman.queueJob(cojob) ### DFG4GRN dfg = DFG4GRN() settings["dfg4grn"]["eta_z"] = 0.01 settings["dfg4grn"]["lambda_w"] = 0.001 settings["dfg4grn"]["tau"] = 3 dfg.setup(timeseries_storage, TFList(timeseries_storage[0].gene_list), settings, "DFG", 20) jobman.queueJob(dfg) ### Inferelator ### NIR nirjob = NIR() nirjob.setup(knockout_storage, settings, "NIR", 5, 5) jobman.queueJob(nirjob) #### TDARACNE settings = ReadConfig(settings, "./config/default_values/tdaracne.cfg") bjob = tdaracne() settings["tdaracne"]["num_bins"] = 4 bjob.setup(timeseries_storage, settings, "TDARACNE") jobman.queueJob(bjob) print jobman.queue jobman.runQueue() jobman.waitToClear(name) SaveResults(jobman.finished, goldnet, settings, name) cache[name] = jobman.finished[:] return cache[name]
multifactorial_filename = exp_set + '-1_multifactorial.tsv' dex_filename = exp_set + '-1_multifactorial.tsv' goldstandard_filename = exp_set + '-1_goldstandard.tsv' ts_only_data = {} ts_pert_data = {} pert_data = {} ko_pert_data = {} # Do TS only first ts_only_data["timeseries"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + timeseries_filename, "timeseries") #knockdowns.normalize() ts_only_data["ss_data"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + wildtype_filename, "wildtype") #wildtypes.normalize() ts_only_data["multifactorial_data"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + multifactorial_filename, "multifactorial") ts_only_data["knockout_data"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + knockout_filename, "knockout") #pert_data = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + multifactorial_filename, "multifactorial") #multifactorials.normalize() ts_only_data["goldnet_file"] = exp_data_directory + "/" + exp_set + "/" + '/TS/' + goldstandard_filename ts_pert_data["timeseries"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + timeseries_filename, "timeseries") #knockdowns.normalize() ts_pert_data["ss_data"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + wildtype_filename, "wildtype") #wildtypes.normalize() ts_pert_data["multifactorial_data"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + multifactorial_filename, "multifactorial")
# Read in the gold standard network goldnet = Network() goldnet.read_goldstd(settings["global"]["large_network_goldnet_file"]) ko_file = settings["global"]["large_network_knockout_file"].split() kd_file = settings["global"]["large_network_knockdown_file"].split() ts_file = settings["global"]["large_network_timeseries_file"].split() wt_file = settings["global"]["large_network_wildtype_file"].split() # Read data into program # Where the format is "FILENAME" "DATATYPE" knockout_storage = ReadData(ko_file[0], "knockout") knockdown_storage = ReadData(kd_file[0], "knockdown") timeseries_storage = ReadData(ts_file[0], "timeseries") wildtype_storage = ReadData(wt_file[0], "wildtype") wildtype_storage.combine(knockout_storage) wildtype_storage.combine(knockdown_storage) wildtype_storage.combine(timeseries_storage) # Setup job manager jobman = JobManager(settings) # Make BANJO jobs mczjob = MCZ() mczjob.setup(knockout_storage, wildtype_storage, settings, None, "mcz-test-run-1") jobman.queueJob(mczjob) print jobman.queue
''' Created on Jun 25, 2012 @author: yyb ''' import os; import ReadData; from copy import deepcopy; from datetime import datetime; ReadData.readItemInfo(); ReadData.readUserInfo(); myUsrInfo = ReadData.usrInfo; myItmInfo = ReadData.itmInfo; #=============================================================================== # Defining my own comparison function for sorting filenames in filelist #=============================================================================== def compare(s1, s2): a = int(s1.split('.')[0]); b = int(s2.split('.')[0]); return cmp(a, b); #=============================================================================== # A function for writing given data into GraphML format
t.append(gene1) tfs[name] = list(set(t)) for key in goldnets.keys(): goldnet = Network() goldnet.read_goldstd(goldnets[key]) goldnets[key] = goldnet genie3nets = {} for i in range(20): for name in data.keys(): ts_storage = data[name] settings["global"]["time_series_delta_t"] = (1008.0 / (len(ts_storage[0].experiments)-1)) combined = ReadData(exp_data_directory + '/' + name + '/' + timeseries_filename, "timeseries")[0] for ts in timeseries_as_steady_state[name][1:11]: combined.combine(ts) #combined.combine(knockouts[name]) combined.combine(multifactorials[name]) genie3job = GENIE3() genie3job.setup(combined, settings, "Genie3_TimeSeries_{0}_{1}".format(name, i)) jobman.queueJob(genie3job) genie3nets[name] = genie3job genie3job.goldnet = goldnets[name] jobman.runQueue() jobman.waitToClear()
if line[k]!=' ': entry += line[k] else: params += [entry] entry='' beta[i].append(float(params[3])) nPart = float(params[0]) nD = float(params[1]) #interaction = float(params[11]) fileExtension = inputLine.replace(' ','-') fileExtension = fileExtension.replace('\n','') scalarFilePath = "data/traces/scalarTrace-" + fileExtension + ".dat" scalarFilePath = "data/traces/Energy-7-3-32-2.7366-7200-1-1-1-0.dat" print "\nReading data from " + scalarFilePath + "." (myArray, myArrayHeadings) = ReadData.loadAscii(scalarFilePath) scalarData[i].append(CalcStatistics.getAndOutputStats(myArray, myArrayHeadings)) #Plotting.makePlots(myArray, myArrayHeadings, fileExtension) print "\nSorting Data..." # Rotate Data into Columns col = [] for i in range(0, len(scalarData)): # For every input file col.append([]) for j in range(0, len(scalarData[i][0])): # For every observable col[i].append([]) for k in range(0, len(scalarData[i][0][0])): # For every statistic col[i][j].append([])