def main():
    # Read options
    with open('options.json', 'r') as inFile:
        options = json.load(inFile)

    # Jacob method is designed to print the entire sequence of the protein reference
    options['pos_range'] = [1, 566]

    # Import data
    data = importData(options)

    # Import protein of reference
    refProt = reference_retreive(options['refProt'])

    # Get binding core and binding core positions
    coreIdxs, coreClass = getBindingCore(options, refProt)

    # Get PTM positions, type and count
    seqPTM, vaccSample, PTM_count = mapPTM(data, refProt, options)

    # Statistical test
    PTM_stats = statisticalTest(options, seqPTM, vaccSample, refProt)

    # Create HTML output
    map2HTML(options, coreIdxs, coreClass, refProt, PTM_stats, seqPTM, vaccSample, PTM_count)
def prepare(file_name, image_path, workspace_dir):
    with open(file_name, 'w') as fn:
        for image in sorted(glob.glob(os.path.join(image_path, '*'))):
            exif_data = get_exif_data(Image.open(image))
            lat, lon = get_lat_lon(exif_data)
            alt, roll, yaw, pitch = xmp(image)
            # print(lon, lat, alt, yaw, pitch, roll)
            st = (os.path.basename(image) + "," + str(float(lon)) + "," +
                  str(float(lat)) + "," + str(float(alt)) + "," +
                  str(float(yaw)) + "," + str(float(pitch)) + "," +
                  str(float(roll)) + "\n")
            fn.write(st)

    all_images, data_matrix = utils.importData(file_name, image_path, workspace_dir)
    return all_images, data_matrix
def main():
    # Read options
    with open('options.json', 'r') as inFile:
        options = json.load(inFile)

    # Import data
    data = importData(options)

    # Import protein of reference
    refProt = reference_retreive(options['refProt'])

    # Map PTMs
    PTM_map, vaccSample = map_PTMs(options, data, refProt)

    # Statistical test
    PTM_stats = statisticalTest(options, PTM_map, vaccSample, refProt)

    # Convert to HTML and store
    map2HTML(PTM_map, refProt, vaccSample, options, PTM_stats)
def main():
    # Read options
    with open('options.json', 'r') as inFile:
        options = json.load(inFile)

    # Import data
    data = importData(options)

    # Import protein of reference
    refProt = reference_retreive(options['refProt'])

    # Count PTMs for each unique sequence
    seqPTM, seqCount, seqInit, PTM_count = mapSeqPTM(data, refProt, options)

    # Get binding core and binding core positions
    coreIdxs = getBindingCore(options, refProt)

    # Compute HTML document
    seq2HTML(options, seqPTM, seqCount, seqInit, PTM_count, refProt, coreIdxs)
def main():
    # Read options
    with open('options.json', 'r') as inFile:
        options = json.load(inFile)

    # Import data
    data = importData(options)

    # Import protein of reference
    refProt = reference_retreive(options['refProt'])

    # Get binding cores and binding core positions
    coreIdxs, coreClass = getBindingCore(options, refProt)

    # Map mutations
    seqMut, vaccSample = mapMutations(data, refProt, options)

    # Compute Fisher exact test
    MUT_stats = statisticalTest(options, seqMut, vaccSample, refProt)

    # Create HTML output
    map2HTML(options, coreIdxs, coreClass, refProt, MUT_stats, seqMut, vaccSample)
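The main() entry points above all begin by loading options.json before calling importData. The full schema of that file is not shown in these excerpts; purely as a sketch, the two keys they actually reference (refProt and pos_range) could be written out as below. The accession value is a placeholder, and any extra fields that importData expects are omitted here.

import json

# Hypothetical minimal options.json covering only the keys used in these snippets;
# the real file will contain additional settings consumed by importData.
options = {
    "refProt": "EXAMPLE_ACCESSION",  # placeholder identifier for the reference protein
    "pos_range": [1, 566],           # residue range to analyse (some scripts override this)
}

with open("options.json", "w") as outFile:
    json.dump(options, outFile, indent=2)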
import random, pylab, utils

FILENAME = './data/student_survey.csv'

data = utils.importData(FILENAME)
course = data['Course']
handed = data['Handed']
sex = data['Sex']

# Replace missing values ('*') before converting to numeric types
data['Verbal'] = [verbal if verbal != '*' else '0' for verbal in data['Verbal']]
verbal = [int(verbal) for verbal in data['Verbal']]
data['Age'] = [age if age != '*' else 0 for age in data['Age']]
age = [float(age) for age in data['Age']]

# Draw random samples of 192 records from each column
courseSample = random.sample(course, 192)
handedSample = random.sample(handed, 192)
sexSample = random.sample(sex, 192)
verbalSample = random.sample(verbal, 192)
ageSample = random.sample(age, 192)

# Compare pivot tables for the full columns against the samples
p_course = utils.buildPivotTable(course)
print(p_course)
s_course = utils.buildPivotTable(courseSample)
print(s_course)

p_handed = utils.buildPivotTable(handed)
print(p_handed)
s_handed = utils.buildPivotTable(handedSample)
print(s_handed)

p_sex = utils.buildPivotTable(sex)
print(p_sex)
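utils.buildPivotTable is not included in this excerpt. Judging from how it is printed for both the full columns and the 192-record samples, it appears to produce a frequency table for a categorical column. A minimal stand-in, stated only as an assumption about its behaviour, could look like this:

from collections import Counter

def build_pivot_table(values):
    """Hypothetical stand-in for utils.buildPivotTable: relative frequency per category."""
    counts = Counter(values)
    total = len(values)
    return {category: count / total for category, count in counts.items()}

# Example: build_pivot_table(['L', 'R', 'R']) -> {'L': 0.33..., 'R': 0.66...}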
def main():
    # Read options
    with open('options.json') as inFile:
        options = json.load(inFile)

    # Set position range to analyze the segment of interest
    options['pos_range'] = [277, 323]

    # Import data
    data = importData(options)

    # Get glycosylation counts, non-glycosylated sites and non-deamidated 277N
    vaccSample, glycoCount, nonglycoCount, nonglycoSample, \
        nonDeamidCount, nonDeamidSample, nonGlDa, nonGlDaSample = getGlycoAmid(options, data)

    # Compute Fisher's test
    fisherResult = fisherTest(glycoCount, vaccSample)

    # Print
    print('\n')
    print('Glycosylation Fisher\'s test result:')
    for vacc in fisherResult:
        print(vacc + ' {:.2%} ({})'.format(glycoCount[vacc]/vaccSample[vacc], vaccSample[vacc]) +
              ' vs. PAN {:.2%} ({}): pvalue = {:.2}, oddsratio = {:.2}'.format(
                  glycoCount['PAN']/vaccSample['PAN'], vaccSample['PAN'],
                  fisherResult[vacc]['pvalue'], fisherResult[vacc]['oddsratio']))

    # Print probabilities of deamidation and glycosylation as separate entities, which can
    # appear in different sequences. This approach does not allow a Fisher's test, but it
    # gives the joint probability as the product of the two marginal frequencies.
    print('\n')
    print('SCENARIO A: Considering 277N and glycosylation sites as different entities that can occur in separate segments')
    print('Probability of non-deamidated and non-glycosylated in ARP: ' +
          '{:.2%}'.format((nonglycoCount['ARP']/nonglycoSample['ARP']) * (nonDeamidCount['ARP']/nonDeamidSample['ARP'])))
    print('Probability of non-deamidated and non-glycosylated in FOC: ' +
          '{:.2%}'.format((nonglycoCount['FOC']/nonglycoSample['FOC']) * (nonDeamidCount['FOC']/nonDeamidSample['FOC'])))
    print('Probability of non-deamidated and non-glycosylated in PAN: ' +
          '{:.2%}'.format((nonglycoCount['PAN']/nonglycoSample['PAN']) * (nonDeamidCount['PAN']/nonDeamidSample['PAN'])))

    # Print probabilities of sequences that include both 277N and glycosylation sites.
    # This allows computing a Fisher's test of PAN vs. ARP and FOC.
    print('\n')
    print('SCENARIO B: Considering only sequences that include both 277N and glycosylation sites:')
    print('Probability of non-deamidated and non-glycosylated in ARP: ' +
          '{:.2%}'.format(nonGlDa['ARP']/nonGlDaSample['ARP']))
    print('Probability of non-deamidated and non-glycosylated in FOC: ' +
          '{:.2%}'.format(nonGlDa['FOC']/nonGlDaSample['FOC']))
    print('Probability of non-deamidated and non-glycosylated in PAN: ' +
          '{:.2%}'.format(nonGlDa['PAN']/nonGlDaSample['PAN']))

    # Fisher's test on SCENARIO B
    nonGlDaFisher = fisherTest(nonGlDa, nonGlDaSample)

    # Print
    print('\n')
    for vacc in nonGlDaFisher:
        print('Fisher\'s test result for SCENARIO B:')
        print(vacc + ' {:.2%} ({})'.format(nonGlDa[vacc]/nonGlDaSample[vacc], nonGlDaSample[vacc]) +
              ' vs. PAN {:.2%} ({}): pvalue = {:.2}, oddsratio = {:.2}'.format(
                  nonGlDa['PAN']/nonGlDaSample['PAN'], nonGlDaSample['PAN'],
                  nonGlDaFisher[vacc]['pvalue'], nonGlDaFisher[vacc]['oddsratio']))
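The fisherTest helper is not part of this excerpt. Based on how its result is consumed above (a per-group dict with 'oddsratio' and 'pvalue', each vaccine compared against 'PAN'), one plausible implementation would wrap scipy.stats.fisher_exact on the corresponding 2x2 contingency tables. This is a sketch of that assumption, not the project's actual code:

from scipy.stats import fisher_exact

def fisher_test(counts, samples, baseline='PAN'):
    """Hypothetical reimplementation of fisherTest: compare each group against the baseline."""
    results = {}
    for group in counts:
        if group == baseline:
            continue
        # 2x2 table: [modified, unmodified] counts for the group vs. the baseline
        table = [
            [counts[group], samples[group] - counts[group]],
            [counts[baseline], samples[baseline] - counts[baseline]],
        ]
        oddsratio, pvalue = fisher_exact(table)
        results[group] = {'oddsratio': oddsratio, 'pvalue': pvalue}
    return results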
def start():
    begin = msg('開始處理文件')

    # Read the various configuration parameters
    configFile = open('./config.json', 'r', encoding='utf-8')
    config = json.loads(configFile.read())
    table = config['table']
    source = config['source']
    data = config['files']['data']
    include = config['files']['include']
    exclude = config['files']['exclude']

    # Connect to the database
    connect = mysql.connector.connect(**config['mysql'])

    # Drop the old table and create a new one
    cursor = connect.cursor()
    cursor.execute('DROP TABLE IF EXISTS {}'.format(table))
    sql = '''CREATE TABLE {} (
        `id` int(11) NOT NULL AUTO_INCREMENT,
        `author` text COLLATE utf8mb4_unicode_ci DEFAULT NULL,
        `dynasty` text COLLATE utf8mb4_unicode_ci NOT NULL,
        `title` text COLLATE utf8mb4_unicode_ci DEFAULT NULL,
        `rhythmic` text COLLATE utf8mb4_unicode_ci DEFAULT NULL,
        `chapter` text COLLATE utf8mb4_unicode_ci DEFAULT NULL,
        `paragraphs` text COLLATE utf8mb4_unicode_ci NOT NULL,
        `notes` text COLLATE utf8mb4_unicode_ci DEFAULT NULL,
        `collection` text COLLATE utf8mb4_unicode_ci NOT NULL,
        `section` text COLLATE utf8mb4_unicode_ci DEFAULT NULL,
        `content` text COLLATE utf8mb4_unicode_ci DEFAULT NULL,
        `comment` text COLLATE utf8mb4_unicode_ci DEFAULT NULL,
        `tags` text COLLATE utf8mb4_unicode_ci DEFAULT NULL,
        PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;'''
    cursor.execute(sql.format(table))

    # Process each JSON file in turn
    arr = []
    l = 0
    total = 0
    for d in data:
        if len(include) and d['collection'] not in include:
            continue
        if len(exclude) and d['collection'] in exclude:
            continue
        res = importData(connect, source, table, d['folder'], d['pattern'],
                         d['dynasty'], d['collection'])
        l = max(l, len(d['collection']))
        if res['count'] is None:
            arr.append({
                'collection': d['collection'],
                'time': res['time'],
                'count': '失敗'  # "failed"
            })
        else:
            arr.append({
                'collection': d['collection'],
                'time': res['time'],
                'count': res['count']
            })
            total += res['count']

    cursor.close()
    connect.close()

    # Finally print the summary statistics
    end = msg('所有文件處理完畢, 記錄總數: ' + str(total))
    msg()
    for v in arr:
        count = v['count']
        msg('{} 用時 {} {}'.format(
            v['collection'].ljust(l + l - len(v['collection'])),
            v['time'], v['count']))
    msg('共計用時 ' + getTimeString(begin, end))
    msg()
                x, y = list(getBatches(X_val, len(X_val), self.n_features))[0]
                feed_dict = {self.X: x, self.Y_label: y, self.L2_param: L2_parameter}
                summary = sess.run(summary_merge, feed_dict=feed_dict)
                val_writer.add_summary(summary, global_step)
                print(global_step)
                global_step += 1
                b += 1

    # Define inference/prediction routine


# Run the script
DEBUGGING = False

if __name__ == "__main__":
    if DEBUGGING:
        # Import raw data
        df = importData(DATA_PATH)

        # Convert data into usable format by tokenizing
        formatAllMessages(df)
        word_freq = wordFrequency(df)
        word_map = makeWordMappingFromFreqList(word_freq, drop_below=2)
        tokenizeMessages(word_map, df)

        # Split into train, validation, and test sets
        spam = df[df["y"] == 1]
        ham = df[df["y"] == 0]
        print(len(spam), "+", len(ham), "=", len(spam) + len(ham))
        X_train = balanceDataset(spam.iloc[100:], ham.iloc[100:])
    else:
        # Import raw data
        df = importData(DATA_PATH)

        # Convert data into usable format by tokenizing
        formatAllMessages(df)
if not os.path.exists(tmpDir):
    os.makedirs(tmpDir)
if not os.path.exists(dataDir):
    os.makedirs(dataDir)

mast = MAST('kepler/data_search')
kicIds = mast.search({
    'max_records': 3000,     # same amount of data without exoplanets
    'sci_data_quarter': 0,   # avoid duplicate ids
})[2:, 0]

## Use this to retrieve published/confirmed planets
# mast.setDataSet("kepler/published_planets")
# confirmedPlanetKicIds = np.unique(mast.search({
#     "max_records": 200
# })[2:, 1])

for i in range(len(kicIds)):
    print("%i/%i" % (i + 1, len(kicIds)))
    kicId = str(kicIds[i]).zfill(9)
    if dataExists(kicId):
        continue
    utils.importData(tmpDir, kicId)
    processData(kicId, 0)
    deleteTempFiles(kicId)

print("Data import finished\n")
        return stree[u'class']

    def getRandomForestClass(self, data):
        # Let every tree in the forest vote, then return the majority class
        sureclass = {}
        if self.RootNode is None:
            raise ValueError('There is no Random Forest!')
        else:
            for key, value in self.RootNode.items():
                tclass = self.getClass(value, data)
                sureclass[tclass] = sureclass.get(tclass, 0) + 1
            ans = max(sureclass.items(), key=lambda x: x[1])
            return ans[0]


if __name__ == '__main__':
    dataset = utils.importData(sys.path[0] + '/data/wine.txt')
    rf = randomforest(dataset=dataset, treeNum=10)
    rf.getRandomForest(name='test1')
    print(rf.RootNode)
    rf.saveTree('test1')
    rf.readTree(sys.path[0] + '/tree_json/test1.txt')
    print(rf.RootNode)