def create_app(configfile='mmctl.conf'):
    app = Flask(__name__)
    app.version = '0.2.0'
    app.config.from_object(defaults)
    # check if there is a configuration file; if not, run the config blueprint
    try:
        conffile = os.path.join(app.instance_path, 'mmctl.conf')
        app.config.from_pyfile(conffile)
    except IOError:
        # load configuration blueprint
        from cfgutil import cfgutil
        import random
        app.register_blueprint(cfgutil)
        # generate initial salt
        if app.config['PBKDF2_SALT'] is None:
            app.config['PBKDF2_SALT'] = \
                '%x' % random.SystemRandom().getrandbits(96)
    else:
        # load api/ui blueprint
        from mmctlui import mmctlui
        app.register_blueprint(mmctlui)
        # load slice file; this could be done dynamically later on
        app.meta = load_meta(app, app.config['ICE_STRING'])
    AssetEnvironment(app)
    return app
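# A minimal usage sketch for the factory above, assuming this module is run
# directly with Flask's built-in development server:
if __name__ == '__main__':
    app = create_app()
    app.run(debug=True)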
def get_XY_vectors():
    meta, id_to_idx, idx_to_id = utils.load_meta(chosen_meta)
    all_answers = get_answers_list(meta)
    Y = np.asarray([meta[aid]['Score'] > 0 for aid in all_answers])
    x = [extract_features_from_body(text)
         for post_id, text in utils.fetch_posts(chosen)
         if post_id in all_answers]
    X = np.asarray(x)
    return X, Y
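# A sketch of feeding these vectors to a classifier. KNeighborsClassifier and
# k=5 are assumptions, mirroring the `from sklearn import neighbors` import
# used elsewhere in this project.
from sklearn import neighbors

def train_knn_sketch(k=5):
    X, Y = get_XY_vectors()
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    clf.fit(X, Y)
    return clf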
def get_photons(amount=1):
    print('amount = ', amount)
    alldata = load_meta(kind='pt_outputs', amount=amount)
    cur_seg, pred_seg_res, cur_data, _, trainbool, astro_dict = alldata[-amount]
    del alldata
    # boolean masks selecting photons in each confusion-matrix bin
    metrics = get_bin_measures(cur_seg, pred_seg_res, sum=False)
    true_pos, false_neg, false_pos, true_neg = np.sum(metrics, axis=1)
    print(astro_dict)
    print(trainbool)
    print(confusion_matrix(false_neg, true_pos, true_neg, false_pos,
                           true_neg + false_pos, true_pos + false_neg))
    all_photons = np.concatenate((cur_data[metrics[0]], cur_data[metrics[1]],
                                  cur_data[metrics[2]], cur_data[metrics[3]]),
                                 axis=0)
    star_photons = np.concatenate((cur_data[metrics[1]], cur_data[metrics[3]]),
                                  axis=0)
    planet_photons = np.concatenate((cur_data[metrics[0]], cur_data[metrics[2]]),
                                    axis=0)
    return all_photons, star_photons, planet_photons
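# Usage sketch: pull the most recent saved 'pt_outputs' entry and report how
# many photons landed in each class (names follow the function above).
if __name__ == '__main__':
    all_photons, star_photons, planet_photons = get_photons(amount=1)
    print('total:', all_photons.shape[0])
    print('star:', star_photons.shape[0])
    print('planet:', planet_photons.shape[0])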
import time

import numpy as np
# from sklearn.model_selection import KFold
from sklearn import neighbors

from data import chosen, chosen_meta
from utils import plot_roc, plot_pr
from utils import plot_feat_importance
from utils import load_meta
from utils import fetch_posts
from utils import plot_feat_hist
from utils import plot_bias_variance
from utils import plot_k_complexity

start_time = time.time()

# question Id -> {'features' -> feature vector, 'answers' -> [answer Ids],
#                 'scores' -> [scores]}
# scores will be added on the fly as they are not in meta
meta, id_to_idx, idx_to_id = load_meta(chosen_meta)

import nltk

# split questions into train (70%) and test (30%), then take their answers
all_posts = list(meta.keys())
all_questions = [q for q, v in meta.items() if v['ParentId'] == -1]
all_answers = [q for q, v in meta.items() if v['ParentId'] != -1]  # [:500]

feature_names = np.array((
    'NumTextTokens',
    'NumCodeLines',
    'LinkCount',
    'AvgSentLen',
    'AvgWordLen',
    'NumAllCaps',
    'NumExclams',
    'NumImages',
))

# activate the following for reduced feature space
"""
# initialize paths
save_root = os.path.join(opt.checkpoint_dir, opt.tag)
log_root = os.path.join(opt.log_dir, opt.tag)

utils.try_make_dir(save_root)
utils.try_make_dir(log_root)

# dataloaders
train_dataloader = train_dataloader
val_dataloader = val_dataloader

# initialize logging
logger = init_log(training=True)

# initialize the meta information for this training run
meta = load_meta(new=True)
save_meta(meta)

# initialize the model
Model = get_model(opt.model)
model = Model(opt, logger)

# multi-GPU is not supported yet
# if len(opt.gpu_ids):
#     model = torch.nn.DataParallel(model, device_ids=opt.gpu_ids)
model = model.to(device=opt.device)

if opt.load:
    load_epoch = model.load(opt.load)
    start_epoch = load_epoch + 1 if opt.resume else 1
else:
    start_epoch = 1  # no checkpoint given: start from the first epoch
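# utils.try_make_dir is not shown in this snippet; a minimal sketch of what
# it presumably does (the real helper may differ):
import os

def try_make_dir(path):
    # create the directory if missing; leave it alone if it already exists
    os.makedirs(path, exist_ok=True)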
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.model_selection import KFold  # formerly sklearn.cross_validation
from sklearn import neighbors

from data import chosen, chosen_meta
from utils import plot_roc, plot_pr
from utils import plot_feat_importance
from utils import load_meta
from utils import fetch_posts
from utils import plot_feat_hist
from utils import plot_bias_variance
from utils import plot_k_complexity

# question Id -> {'features' -> feature vector, 'answers' -> [answer Ids],
#                 'scores' -> [scores]}
# scores will be added on the fly as they are not in meta
meta, id_to_idx, idx_to_id = load_meta(chosen_meta)

import nltk

# split questions into train (70%) and test (30%), then take their answers
all_posts = list(meta.keys())
all_questions = [q for q, v in meta.items() if v['ParentId'] == -1]
all_answers = [q for q, v in meta.items() if v['ParentId'] != -1]  # [:500]

feature_names = np.array((
    'NumTextTokens',
    'NumCodeLines',
    'LinkCount',
    'AvgSentLen',
    'AvgWordLen',
    'NumAllCaps',
    'NumExclams',
    'NumImages',
))
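# A sketch of how the imported KFold might drive cross-validation over the
# answer features; the classifier choice and fold count are assumptions, with
# X and Y as built by the sibling get_XY_vectors snippet.
def cv_sketch(X, Y, n_splits=10):
    cv = KFold(n_splits=n_splits, shuffle=True)
    scores = []
    for train_idx, test_idx in cv.split(X):
        clf = neighbors.KNeighborsClassifier()
        clf.fit(X[train_idx], Y[train_idx])
        scores.append(clf.score(X[test_idx], Y[test_idx]))
    return np.mean(scores)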
import functools as FT
import itertools as IT
import re
import typing as T
from pathlib import Path

from bs4 import BeautifulSoup


def main():
    import optparse
    p = optparse.OptionParser(usage='%prog [options] <src_dir> <db_path>')
    opts, args = p.parse_args()
    src_dir, db_path = map(Path, args)
    assert src_dir.is_dir()
    assert not db_path.exists()

    Session, db_engine = make_session_and_engine(db_path)
    Base.metadata.create_all(db_engine)
    session = Session()

    title_regexp = re.compile(
        r'\s*(\d+)\s*(?:-|–)\s*(?:\w+),?.?\s*(?:“|«|\")?\s*((?:\'|`|[^\d\W])+\s*(?:`|[^\d\W])+)\s*(?:\d*)\s*(?:”|»|\")?',
        re.UNICODE)
    BISMILLAH_LITERALS = {
        'Мээримдүү, Ырайымдуу Аллахтын аты менен',
        'Ырайымдуу, Мээримдүү Аллахтын аты менен'
    }
    surah_infos = load_meta()
    note_id_func = FT.partial(next, infinite_numbers_gen())

    def link_positions(text) -> T.Tuple[str, T.List[T.Tuple[int, int]]]:
        """
        'Some{1} text with links{2} in it{3}!'
        -> ('Some text with links in it!', [(4, 1), (20, 2), (26, 3)])
        """
        chunks = re.split(r'(\{\d+\})', text)
        # ['Some', '{1}', ' text with links', '{2}', ' in it', '{3}', '!']
        text_chunks = []
        links = []
        for i, ch in enumerate(chunks):
            if i % 2 == 0:
                text_chunks.append(ch)
            else:
                # (index, link_id)
                links.append((sum(map(len, text_chunks)), int(ch[1:-1])))
        return ''.join(text_chunks), links

    for findex, p in enumerate(
            sorted(src_dir.glob('suro_*.html'),
                   key=lambda i: int(''.join(ch for ch in i.name
                                             if ch.isdigit())))):
        with p.open() as f:
            soup = BeautifulSoup(f.read(), features='lxml')
        # note_links = soup.find_all('a', {'class': 'sdfootnoteanc'})
        # assert len(soup.find_all('div', {'class': 'hidden'})) == 1
        notes_container = soup.select_one('div.hidden')

        def get_link_content(link_el):
            content_el = notes_container.select_one(
                'div{} > p'.format(link_el['href'][:-3]))
            # assert len(content_el.find_all('a')) == 1
            content_el.select_one('a').decompose()
            # assert len(content_el.find_all('sup')) == 1, link_el['href']
            for el in content_el.select('sup'):
                el.decompose()
            return ' '.join(content_el.text.split())

        # title parsing
        title_container = soup.find('div', {'class': 'title-cont'})
        main_title = title_container.select_one(
            'div.title-parent > div.title-center')
        # assert len(main_title.select('a')) in (0, 1), p.name
        title_note_link = main_title.select_one('a')
        # assert not title_note_link or title_note_link['name'] != '_GoBack'
        title_note_content = get_link_content(
            title_note_link) if title_note_link else None
        # print(p, repr(title_note_content))
        # continue
        title = main_title.text
        suro_num_literal, title = title_regexp.match(title).groups()
        title = ' '.join(title.split())
        surah_num = int(suro_num_literal)
        # print(surah_num, title)
        surah_info = surah_infos[surah_num - 1]

        raw_kek = [
            i.text for i in title_container.find_all('p', recursive=False)
        ]
        kek = [
            ' '.join(i.split()).rstrip('.!') for i in raw_kek
            if 'бөлүм' not in i and i.strip()
        ]
        has_bismillah_pre = BISMILLAH_LITERALS.intersection(kek)
        assert surah_num not in (1, 9) or not has_bismillah_pre, kek
        assert surah_num in (1, 9) or has_bismillah_pre, kek
        kek = [i for i in kek if i not in BISMILLAH_LITERALS]
        assert len(kek) in (0, 1), (p, surah_num, title, kek)
        (info, ) = kek
        assert '.' in info

        revelation_place, ayat_number_text = [
            i.strip().capitalize() for i in info.split('.')
        ]
        assert {
            'Меккеде': 'Makkah',
            'Мединада': 'Madinah'
        }[revelation_place.split()[0]] == surah_info['revelation_place']
        # print('\t', revelation_place, ayat_number_text)
        revelation_place = revelation_place.split()[0][:-2]

        # todo: check ayat_number_text
        ayattan_turat_literal = 'айаттан турат'
        assert ayat_number_text.endswith(ayattan_turat_literal), repr(
            ayat_number_text)
        ayat_number_text = ayat_number_text[:-len(ayattan_turat_literal)] \
            .rstrip()
        ayat_number = ky_number_text_to_int(ayat_number_text)
        assert ayat_number == surah_info['verses_count'], (
            ayat_number_text, '->', ayat_number, '!=',
            surah_info['verses_count'])
        assert int_to_ky_text(ayat_number).capitalize() == ayat_number_text, (
            int_to_ky_text(ayat_number).capitalize(), ayat_number_text)
        # print(ky_to_int_vals)

        # content parsing
        content_lists = soup.find_all('ol')
        # content_lines = soup.find_all('li')
        content_lines = IT.chain.from_iterable(
            ol.find_all('li') for ol in content_lists)
        # assert len(list(content_lines)) == len(soup.find_all('li'))
        content_lines = list(content_lines)
        # contains_ayah_number_regexp = re.compile(r'^\d+\.', re.UNICODE)
        # assert not any(contains_ayah_number_regexp.match(l)
        #                for l in content_lines)
        cll = len(content_lines)
        assert cll == ayat_number, (cll, ayat_number)

        def process_content_line(el):
            link_contents = []
            i = 0
            for link in el.find_all('a', {'class': 'sdfootnoteanc'}):
                link_cont = get_link_content(link)
                if link_cont:
                    link_contents.append(link_cont)
                    link.replace_with('{{{}}}'.format(i))
                    i += 1
                else:
                    print('\tempty note:', surah_num, title, link['href'])
                    link.decompose()
            text = ' '.join(el.text.split())
            text, notes = link_positions(text)
            return text, [(str_index, link_contents[i])
                          for str_index, i in notes]

        verses = list(map(process_content_line, content_lines))

        surah = Surah(
            number=surah_info['number'],
            title=title,
            verses_count=ayat_number,
            revelation_place=revelation_place,
            chronological_order=surah_info['chronological_order'],
            bismillah_pre=bool(has_bismillah_pre),
            title_note=title_note_content,
        )

        def make_verse(index, verse_text, notes):
            verse = Verse(number=index + 1, text=verse_text)
            verse.notes = [
                Note(text_position=str_index, text=text)
                for str_index, text in notes
            ]
            return verse

        surah.verses = [make_verse(i, t, n) for i, (t, n) in enumerate(verses)]
        session.add(surah)
        session.commit()
        print(surah)
        assert surah.id == surah_info['number'] == surah.number == findex + 1
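# infinite_numbers_gen and the ky_* number helpers are project-local and not
# shown in this snippet. Given how note_id_func consumes it above, a minimal
# sketch of the generator (an assumption, not the original implementation):
def infinite_numbers_gen():
    n = 1
    while True:
        yield n  # 1, 2, 3, ... without end
        n += 1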
# assumes module-level: import errno, os, sys; import numpy as np; plus the
# project-local utils module, make_argument_parser and train
def main():
    """
    The main executable function
    """
    parser = make_argument_parser()
    args = parser.parse_args()

    input_dirs = args.inputdirs
    tf = args.factor
    valid_chroms = args.validchroms
    valid_input_dirs = args.validinputdirs
    test_chroms = args.testchroms
    epochs = args.epochs
    patience = args.patience
    learningrate = args.learningrate
    seed = args.seed
    utils.set_seed(seed)
    dropout_rate = args.dropout
    L = args.seqlen
    w = args.motifwidth
    utils.L = L
    utils.w = w
    utils.w2 = w / 2
    negatives = args.negatives
    assert negatives > 0
    meta = args.meta
    gencode = args.gencode
    motif = args.motif

    num_motifs = args.kernels
    num_recurrent = args.recurrent
    num_dense = args.dense

    features = ['bigwig']

    if tf:
        print 'Single-task training:', tf
        singleTask = True
        if meta:
            print 'Including metadata features'
            features.append('meta')
        if gencode:
            print 'Including genome annotations'
            features.append('gencode')
    else:
        print 'Multi-task training'
        singleTask = False
        # Cannot use any metadata features
        assert not meta
        assert not gencode

    if args.outputdir is None:
        clobber = True
        output_dir = args.outputdirc
    else:
        clobber = False
        output_dir = args.outputdir

    try:  # adapted from dreme.py by T. Bailey
        os.makedirs(output_dir)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            if not clobber:
                print >> sys.stderr, (
                    'output directory (%s) already exists '
                    'but you specified not to clobber it') % output_dir
                sys.exit(1)
            else:
                print >> sys.stderr, ('output directory (%s) already exists '
                                      'so it will be clobbered') % output_dir

    print 'Loading genome'
    genome = utils.load_genome()

    if valid_input_dirs:
        print 'You specified at least one validation input directory'
        assert singleTask  # This option only works for single-task training

    print 'Loading ChIP labels'
    if singleTask:
        chip_bed_list, nonnegative_regions_bed_list = \
            utils.load_chip_singleTask(input_dirs, tf)
        if valid_input_dirs:
            valid_chip_bed_list, valid_nonnegative_regions_bed_list = \
                utils.load_chip_singleTask(valid_input_dirs, tf)
        num_tfs = 1
    else:
        # multi-task training only supports one cell line
        assert len(input_dirs) == 1
        input_dir = input_dirs[0]
        tfs, positive_windows, y_positive, nonnegative_regions_bed = \
            utils.load_chip_multiTask(input_dir)
        num_tfs = len(tfs)

    print 'Loading bigWig data'
    bigwig_names, bigwig_files_list = utils.load_bigwigs(input_dirs)
    num_bigwigs = len(bigwig_names)
    if valid_input_dirs:
        valid_bigwig_names, valid_bigwig_files_list = utils.load_bigwigs(
            valid_input_dirs)
        assert valid_bigwig_names == bigwig_names
    if not singleTask:
        bigwig_files = bigwig_files_list[0]

    if meta:
        print 'Loading metadata features'
        meta_names, meta_list = utils.load_meta(input_dirs)
        if valid_input_dirs:
            valid_meta_names, valid_meta_list = utils.load_meta(
                valid_input_dirs)
            assert valid_meta_names == meta_names
    else:
        # meta option was not selected, pass empty metadata features
        meta_list = [[] for bigwig_files in bigwig_files_list]
        if valid_input_dirs:
            valid_meta_list = [[] for bigwig_files in valid_bigwig_files_list]

    print 'Making features'
    if singleTask:
        if not valid_input_dirs:
            # validation directories not used, must pass placeholder values
            valid_chip_bed_list = None
            valid_nonnegative_regions_bed_list = None
            valid_bigwig_files_list = None
            valid_meta_list = None
        datagen_train, datagen_valid = \
            utils.make_features_singleTask(
                chip_bed_list, nonnegative_regions_bed_list,
                bigwig_files_list, bigwig_names, meta_list, gencode, genome,
                epochs, negatives, valid_chroms, test_chroms,
                valid_chip_bed_list, valid_nonnegative_regions_bed_list,
                valid_bigwig_files_list, valid_meta_list)
    else:
        datagen_train, datagen_valid = \
            utils.make_features_multiTask(
                positive_windows, y_positive, nonnegative_regions_bed,
                bigwig_files, bigwig_names, genome, epochs, valid_chroms,
                test_chroms)

    print 'Building model'
    if num_recurrent == 0:
        print 'You specified 0 LSTM units. Omitting BLSTM layer'
    if num_recurrent < 0:
        print 'You specified less than 0 LSTM units. Replacing BLSTM layer with global max-pooling layer'
    if meta or gencode:
        num_meta = 0
        if meta:
            num_meta = len(meta_names)
        if gencode:
            num_meta += 6
        model = utils.make_meta_model(num_tfs, num_bigwigs, num_meta,
                                      num_motifs, num_recurrent, num_dense,
                                      dropout_rate)
    else:
        model = utils.make_model(num_tfs, num_bigwigs, num_motifs,
                                 num_recurrent, num_dense, dropout_rate)

    if motif:
        assert singleTask  # This option only works with single-task training
        motifs_db = utils.load_motif_db('resources/HOCOMOCOv9.meme')
        if tf in motifs_db:
            print 'Injecting canonical motif'
            pwm = motifs_db[tf]
            pwm += 0.001
            pwm = pwm / pwm.sum(axis=1)[:, np.newaxis]
            pwm = np.log2(pwm / 0.25)
            utils.inject_pwm(model, pwm)

    output_tf_file = open(output_dir + '/chip.txt', 'w')
    if singleTask:
        output_tf_file.write("%s\n" % tf)
    else:
        for tf in tfs:
            output_tf_file.write("%s\n" % tf)
    output_tf_file.close()

    output_feature_file = open(output_dir + '/feature.txt', 'w')
    for feature in features:
        output_feature_file.write("%s\n" % feature)
    output_feature_file.close()

    output_bw_file = open(output_dir + '/bigwig.txt', 'w')
    for bw in bigwig_names:
        output_bw_file.write("%s\n" % bw)
    output_bw_file.close()

    if meta:
        output_meta_file = open(output_dir + '/meta.txt', 'w')
        for meta_name in meta_names:
            output_meta_file.write("%s\n" % meta_name)
        output_meta_file.close()

    model_json = model.to_json()
    output_json_file = open(output_dir + '/model.json', 'w')
    output_json_file.write(model_json)
    output_json_file.close()

    train(datagen_train, datagen_valid, model, epochs, patience, learningrate,
          output_dir)
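# A toy illustration of the PWM preprocessing in the motif-injection branch
# above: add a pseudocount, renormalize each position, then convert to log2
# odds against a uniform 0.25 background. The example matrix is made up.
import numpy as np

pwm = np.array([[0.9, 0.1, 0.0, 0.0],
                [0.25, 0.25, 0.25, 0.25]])
pwm = pwm + 0.001                           # pseudocount avoids log(0)
pwm = pwm / pwm.sum(axis=1)[:, np.newaxis]  # rows sum to 1 again
log_odds = np.log2(pwm / 0.25)              # 0 means "same as background"
print(log_odds)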