def run_layer(train_data, dev_data, test_data, task_name, model_name, mode, layer):
    task_dir = os.path.join(task_name, 'probes', config.name, mode)
    probe_path = data_path(task_dir, 'train', 'pt', model_name, layer)
    if config.train:
        print('\nstart training layer {} of {} with task {} with config {}'.format(
            layer, model_name, task_name, config.name))
        logdir = os.path.join(config.data.logdir, task_name, config.name,
                              model_name, mode, str(layer).zfill(2))
        if os.path.exists(probe_path):
            print('skipping, {} already exists'.format(probe_path))
        else:
            train(train_data, dev_data, mode, layer, logdir, probe_path + '.tmp')
            shutil.move(probe_path + '.tmp', probe_path)
    if config.export:
        if not os.path.exists(probe_path):
            print('skipping, {} does not exist'.format(probe_path))
        else:
            summary_dir = os.path.join(task_name, 'summaries', config.name, mode)
            summary_path = data_path(summary_dir, mode, 'json', model_name, layer)
            summary, labels, preds = summarize(mode, layer, probe_path, test_data)
            with open(summary_path, 'w') as f:
                json.dump(summary, f)
            return labels, preds
    return None, None
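# run_layer writes the probe to "<probe_path>.tmp" and only then renames it, so
# an interrupted run never leaves a half-written checkpoint at probe_path. A
# minimal self-contained sketch of that write-then-rename pattern (illustrative,
# not code from this repo):
import shutil

def atomic_write_text(path, text):
    tmp = path + '.tmp'
    with open(tmp, 'w') as f:
        f.write(text)
    # The rename is atomic as long as tmp and path are on the same filesystem.
    shutil.move(tmp, path)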
import json


def read_text():
    # Open the file stream.
    with open(data_path(), "r", encoding="utf-8") as f:
        # Create a list for the extracted values.
        arr = []
        # Load the data from the JSON file.
        sss = json.load(f)
        # Look up each value by its key in the JSON data.
        username = sss.get("username")
        mobile = sss.get("mobile")
        workNumber = sss.get("workNumber")
        # Store the extracted values in the list.
        arr.append((username, mobile, workNumber))
        print(arr)
        # Return the list to the caller.
        return arr
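# Example of the JSON file read_text() expects, with hypothetical values (the
# real file's location depends on what data_path() resolves to):
#
#   {"username": "alice", "mobile": "13800000000", "workNumber": "A1024"}
#
# Given that file, read_text() prints and returns:
#
#   [("alice", "13800000000", "A1024")]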
def run_task(task_name, task_data, task_format, model_name, model_id):
    mode = config.layer_mode
    train_data, dev_data, test_data = None, None, None
    if config.train:
        dev_data = load_embeddings(task_name, task_data, task_format, 'dev',
                                   model_name, model_id, config.label_map)
        train_data = dev_data if config.sample else load_embeddings(
            task_name, task_data, task_format, 'train', model_name, model_id,
            config.label_map)
    if config.export:
        test_data = load_embeddings(task_name, task_data, task_format, 'test',
                                    model_name, model_id, config.label_map)
    if config.dry_run:
        print('loaded data, now stopping dry run')
        return

    layers_labels = []
    n_workers = config.num_workers if config.num_workers > 0 else 1
    if n_workers == 1:
        for layer in range(*config.layer_range):
            layer_labels, layer_preds = run_layer(train_data, dev_data, test_data,
                                                  task_name, model_name, mode, layer)
            if config.export and layer_labels is not None:
                if len(layers_labels) == 0:
                    layers_labels.append(layer_labels)
                layers_labels.append(layer_preds)
    else:
        procs_queue, procs_running = [], []
        for layer in range(*config.layer_range):
            p = mp.Process(target=run_layer,
                           args=(train_data, dev_data, test_data,
                                 task_name, model_name, mode, layer))
            procs_queue.append(p)
            # run_layer(train_data, dev_data, test_data, task_name, model_name, mode, layer)
        for p in procs_queue:
            while len(procs_running) >= n_workers:
                time.sleep(1)
                for i in range(n_workers):
                    if not procs_running[i].is_alive():
                        procs_running.pop(i)
                        break
            procs_running.append(p)
            p.start()
        for p in procs_running:
            p.join()

    if len(layers_labels) > 0:
        preds_dir = os.path.join(task_name, 'predictions', config.name)
        preds_path = data_path(preds_dir, mode, 'json', model_name)
        with open(preds_path, 'w') as f:
            for labels in zip(*layers_labels):
                labels = [config.label_map[lab] for lab in labels]
                f.write('\t'.join(labels) + '\n')
        print(f'Saved layer-wise predictions to {preds_path}')

    if config.report:
        summaries = []
        for layer in range(*config.layer_range):
            summary_dir = os.path.join(task_name, 'summaries', config.name, mode)
            summary_path = data_path(summary_dir, mode, 'json', model_name, layer)
            if not os.path.exists(summary_path):
                print('skipping, {} does not exist'.format(summary_path))
            else:
                with open(summary_path) as f:
                    summary = json.load(f)
                summaries.append((layer, summary))
        report(summaries)
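# Note on the multiprocessing branch above: mp.Process cannot return values to
# the parent, so run_layer's labels and predictions are only collected (and the
# predictions file only written) when n_workers == 1. A sketch of how results
# could also be gathered from workers, using a process pool instead (an
# alternative pattern, not what run_task does):
#
#   from functools import partial
#   worker = partial(run_layer, train_data, dev_data, test_data,
#                    task_name, model_name, mode)
#   with mp.Pool(n_workers) as pool:
#       layer_results = pool.map(worker, range(*config.layer_range))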
from gtfparse import read_gtf

from data import data_path

B16_GTF_PATH = data_path("B16.stringtie.head.gtf")


def _check_required_columns(gtf_dict):
    assert "feature" in gtf_dict, "Expected column named 'feature' in StringTie GTF"
    assert "cov" in gtf_dict, "Expected column named 'cov' in StringTie GTF"
    assert "FPKM" in gtf_dict, "Expected column named 'FPKM' in StringTie GTF"
    features = set(gtf_dict["feature"])
    assert "exon" in features, "No exons in GTF (available: %s)" % features
    assert "transcript" in features, "No transcripts in GTF (available: %s)" % features


def _check_string_cov_and_FPKM(gtf_dict):
    for i, feature_name in enumerate(gtf_dict["feature"]):
        cov = gtf_dict["cov"][i]
        fpkm = gtf_dict["FPKM"][i]
        if feature_name == "exon":
            assert len(fpkm) == 0, \
                "Expected missing FPKM for exon, got %s" % (fpkm,)
            assert len(cov) > 0 and float(cov) >= 0, \
                "Expected non-negative cov for exon, got %s" % (cov,)
        elif feature_name == "transcript":
            assert len(cov) and float(cov) >= 0, \
                "Expected non-negative cov for transcript, got %s" % (cov,)
            assert len(fpkm) > 0 and float(fpkm) >= 0, \
                "Expected non-negative FPKM for transcript, got %s" % (fpkm,)


def _check_float_cov_and_FPKM(gtf_dict):
    for i, feature_name in enumerate(gtf_dict["feature"]):
        cov = gtf_dict["cov"][i]
from gtfparse import read_gtf

from data import data_path

REFSEQ_GTF_PATH = data_path("refseq.ucsc.small.gtf")


def _check_required_columns(gtf_dict):
    assert "feature" in gtf_dict, "Expected column named 'feature' in RefSeq GTF"
    assert "gene_id" in gtf_dict, "Expected column named 'gene_id' in RefSeq GTF"
    assert "transcript_id" in gtf_dict, "Expected column named 'transcript_id' in RefSeq GTF"
    features = set(gtf_dict["feature"])
    assert "exon" in features, "No exon features in GTF (available: %s)" % features
    assert "CDS" in features, "No CDS features in GTF (available: %s)" % features


def test_read_refseq_gtf_as_dataframe():
    gtf_df = read_gtf(REFSEQ_GTF_PATH)
    _check_required_columns(gtf_df)
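# Example beyond the assertions above (a sketch of gtfparse's documented usage,
# not an existing test in this repo; assumes read_gtf returns a pandas
# dataframe as in the library's README):
def example_select_cds_rows():
    gtf_df = read_gtf(REFSEQ_GTF_PATH)
    # Attribute fields are expanded into columns, so rows can be selected
    # with ordinary pandas filtering.
    cds = gtf_df[gtf_df["feature"] == "CDS"]
    return cds[["seqname", "start", "end", "gene_id", "transcript_id"]]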
from data import data_path
from gtfparse import read_gtf
from nose.tools import eq_

ENSEMBL_GTF_PATH = data_path("ensembl_grch37.head.gtf")

EXPECTED_FEATURES = set([
    "gene",
    "transcript",
    "exon",
    "CDS",
    "UTR",
    "start_codon",
    "stop_codon",
])


def test_ensembl_gtf_columns():
    df = read_gtf(ENSEMBL_GTF_PATH)
    features = set(df["feature"])
    eq_(features, EXPECTED_FEATURES)


# first 1000 lines of GTF only contained these genes
EXPECTED_GENE_NAMES = {
    'FAM41C', 'CICP27', 'RNU6-1100P', 'NOC2L', 'AP006222.1', 'LINC01128',
    'RP4-669L17.1', 'RP11-206L10.2', 'PLEKHN1', 'WBP1LP7', 'RP5-857K21.1',
    'RP5-857K21.5', 'RNU6-1199P', 'RP11-206L10.10', 'RP11-54O7.16', 'CICP7',
    'AL627309.1', 'RP5-857K21.11', 'DDX11L1', 'RP5-857K21.3', 'RP11-34P13.7',
    'AL669831.1', 'MTATP6P1', 'CICP3', 'WBP1LP6', 'LINC00115', 'hsa-mir-6723',
    'RP5-857K21.7', 'SAMD11', 'RP11-206L10.5', 'RP11-34P13.8', 'RP11-206L10.9',
# set up random seeds
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

# -----------------------------------------------------------------------------------------------------------------
if args.dataset != "mscoco":
    DataField = NormalField
    TRG = DataField(init_token='<init>', eos_token='<eos>', batch_first=True)
    SRC = DataField(batch_first=True) if not args.share_vocab else TRG
    # NOTE: UNK, PAD, INIT, EOS

# set up the datasets (they need to be configured manually)
data_prefix = Path(data_path(args.dataset))
args.data_prefix = data_prefix
if args.dataset == "mscoco":
    data_prefix = str(data_prefix)

train_dir = "train" if not args.use_distillation else "distill/" + args.dataset[-4:]

if args.dataset == 'iwslt-ende' or args.dataset == 'iwslt-deen':
    # if args.resume:
    #     train_dir += "2"
    logger.info("TRAINING CORPUS : " + str(data_prefix / train_dir / 'train.tags.en-de.bpe'))
    train_data = NormalTranslationDataset(
        path=str(data_prefix / train_dir / 'train.tags.en-de.bpe'),
        exts=('.{}'.format(args.src), '.{}'.format(args.trg)),
        fields=(SRC, TRG),
        load_dataset=args.load_dataset,
        save_dataset=args.save_dataset,
        prefix='normal') if args.mode in ["train", "distill"] else None
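# What typically follows this setup in torchtext-based NMT code (a sketch under
# the assumption that this repo uses torchtext's legacy Field/BucketIterator
# API; the vocabulary sizes are illustrative):
#
#   SRC.build_vocab(train_data, max_size=40000)
#   TRG.build_vocab(train_data, max_size=40000)
#   train_iter = torchtext.data.BucketIterator(
#       train_data, batch_size=args.batch_size,
#       sort_key=lambda ex: len(ex.src), train=True, repeat=False)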