Example #1
def run_layer(train_data, dev_data, test_data, task_name, model_name, mode,
              layer):
    task_dir = os.path.join(task_name, 'probes', config.name, mode)
    probe_path = data_path(task_dir, 'train', 'pt', model_name, layer)

    if config.train:
        print('\nstart training layer {} of {} with task {} with config {}'.
              format(layer, model_name, task_name, config.name))

        logdir = os.path.join(config.data.logdir, task_name, config.name,
                              model_name, mode,
                              str(layer).zfill(2))

        if os.path.exists(probe_path):
            print('skipping, {} already exists'.format(probe_path))
        else:
            train(train_data, dev_data, mode, layer, logdir,
                  probe_path + '.tmp')
            shutil.move(probe_path + '.tmp', probe_path)

    if config.export:
        if not os.path.exists(probe_path):
            print('skipping, {} does not exist'.format(probe_path))
        else:
            summary_dir = os.path.join(task_name, 'summaries', config.name,
                                       mode)
            summary_path = data_path(summary_dir, mode, 'json', model_name,
                                     layer)

            summary, labels, preds = summarize(mode, layer, probe_path,
                                               test_data)
            with open(summary_path, 'w') as f:
                json.dump(summary, f)

            return labels, preds

    return None, None
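# Aside (not part of the original example): the train/move sequence above
# writes the probe to `probe_path + '.tmp'` and only renames it into place
# once training finishes, so an interrupted run never leaves a truncated
# file at probe_path. A minimal standalone sketch of that pattern;
# `atomic_write` and its arguments are illustrative names, not from the
# original code.
import shutil

def atomic_write(path, payload):
    # Write to a temporary sibling file first, then rename it into place.
    # On POSIX filesystems the rename is atomic, so readers never observe
    # a half-written file at `path`.
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w') as f:
        f.write(payload)
    shutil.move(tmp_path, path)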
Example #2
def read_text():
    # Open the file stream
    with open(data_path(), "r", encoding="utf-8") as f:
        # Create a list
        arr = []
        # Load the data from the JSON file
        sss = json.load(f)
        # Extract the values from the JSON data by key
        username = sss.get("username")
        mobile = sss.get("mobile")
        workNumber = sss.get("workNumber")
        # Store the extracted values in the list
        arr.append((username, mobile, workNumber))
        print(arr)
    # Return the list from the function
    return arr
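# Aside (illustrative, not from the original example): read_text() expects
# the JSON file at data_path() to look roughly like
#     {"username": "alice", "mobile": "13800000000", "workNumber": "A1024"}
# in which case it returns [("alice", "13800000000", "A1024")]. The same
# pattern, made self-contained with an explicit path argument:
import json

def read_text_demo(path):
    # Load one JSON record and return its fields as a single-tuple list.
    with open(path, "r", encoding="utf-8") as f:
        record = json.load(f)
    return [(record.get("username"), record.get("mobile"),
             record.get("workNumber"))]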
Example #3
from data import data_path
from gtfparse import read_gtf_as_dataframe
from nose.tools import eq_

ENSEMBL_GTF_PATH = data_path("ensembl_grch37.head.gtf")

EXPECTED_FEATURES = set(["gene", "transcript", "exon", "CDS", "UTR", "start_codon", "stop_codon"])


def test_ensembl_gtf_columns():
    df = read_gtf_as_dataframe(ENSEMBL_GTF_PATH)
    features = set(df["feature"])
    eq_(features, EXPECTED_FEATURES)
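# Aside (assumption, not from the original test file): the `data_path`
# helper imported above is typically a one-liner that resolves a file name
# against a local test-data directory, along these lines:
import os

def data_path(name):
    # Assumes the GTF fixtures live in a `data/` directory next to this
    # file; the real helper may differ.
    return os.path.join(os.path.dirname(__file__), "data", name)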


# first 1000 lines of GTF only contained these genes
EXPECTED_GENE_NAMES = {
    "FAM41C",
    "CICP27",
    "RNU6-1100P",
    "NOC2L",
    "AP006222.1",
    "LINC01128",
    "RP4-669L17.1",
    "RP11-206L10.2",
    "PLEKHN1",
    "WBP1LP7",
    "RP5-857K21.1",
    "RP5-857K21.5",
    "RNU6-1199P",
    "RP11-206L10.10",
Example #4
def run_task(task_name, task_data, task_format, model_name, model_id):
    mode = config.layer_mode
    train_data, dev_data, test_data = None, None, None

    if config.train:
        dev_data = load_embeddings(task_name, task_data, task_format, 'dev',
                                   model_name, model_id, config.label_map)
        train_data = dev_data if config.sample else load_embeddings(
            task_name, task_data, task_format, 'train', model_name, model_id,
            config.label_map)

    if config.export:
        test_data = load_embeddings(task_name, task_data, task_format, 'test',
                                    model_name, model_id, config.label_map)

    if config.dry_run:
        print('loaded data, now stopping dry run')
        return

    layers_labels = []

    n_workers = config.num_workers if config.num_workers > 0 else 1
    if n_workers == 1:
        for layer in range(*config.layer_range):
            layer_labels, layer_preds = run_layer(train_data, dev_data,
                                                  test_data, task_name,
                                                  model_name, mode, layer)
            if config.export and layer_labels is not None:
                if len(layers_labels) == 0:
                    layers_labels.append(layer_labels)
                layers_labels.append(layer_preds)
    else:
        procs_queue, procs_running = [], []
        for layer in range(*config.layer_range):
            p = mp.Process(target=run_layer,
                           args=(train_data, dev_data, test_data, task_name,
                                 model_name, mode, layer))
            procs_queue.append(p)
            # run_layer(train_data, dev_data, test_data, task_name, model_name, mode, layer)

        for p in procs_queue:
            while len(procs_running) >= n_workers:
                time.sleep(1)
                for i in range(n_workers):
                    if not procs_running[i].is_alive():
                        procs_running.pop(i)
                        break

            procs_running.append(p)
            p.start()

        for p in procs_running:
            p.join()

    if len(layers_labels) > 0:
        preds_dir = os.path.join(task_name, 'predictions', config.name)
        preds_path = data_path(preds_dir, mode, 'json', model_name)

        with open(preds_path, 'w') as f:
            for labels in zip(*layers_labels):
                labels = [config.label_map[lab] for lab in labels]
                f.write('\t'.join(labels) + '\n')

        print(f'Saved layer-wise predictions to {preds_path}')

    if config.report:
        summaries = []

        for layer in range(*config.layer_range):
            summary_dir = os.path.join(task_name, 'summaries', config.name,
                                       mode)
            summary_path = data_path(summary_dir, mode, 'json', model_name,
                                     layer)

            if not os.path.exists(summary_path):
                print('skipping, {} does not exist'.format(summary_path))
            else:
                with open(summary_path) as f:
                    summary = json.load(f)
                summaries.append((layer, summary))

        report(summaries)
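# Aside (not from the original example): the manual procs_queue /
# procs_running loop above caps the number of live processes at n_workers.
# Note that in that branch the return values of run_layer are discarded,
# so layers_labels stays empty. multiprocessing.Pool expresses the same
# throttling more compactly and also collects results; a minimal sketch
# with a stand-in worker (names are illustrative):
import multiprocessing as mp

def _square(layer):
    # Stand-in for run_layer(...): any picklable function works here.
    return layer * layer

if __name__ == '__main__':
    # Run at most 4 workers at a time over the layer range.
    with mp.Pool(processes=4) as pool:
        results = pool.map(_square, range(12))
    print(results)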
Example #5
from gtfparse import read_gtf
from data import data_path

B16_GTF_PATH = data_path("B16.stringtie.head.gtf")

def _check_required_columns(gtf_dict):
    assert "feature" in gtf_dict, "Expected column named 'feature' in StringTie GTF"
    assert "cov" in gtf_dict, "Expected column named 'cov' in StringTie GTF"
    assert "FPKM" in gtf_dict, "Expected column named 'FPKM' in StringTie GTF"
    features = set(gtf_dict["feature"])
    assert "exon" in features, "No exons in GTF (available: %s)" % features
    assert "transcript" in features, "No transcripts in GTF (available: %s)" % features

def _check_string_cov_and_FPKM(gtf_dict):
    for i, feature_name in enumerate(gtf_dict["feature"]):
        cov = gtf_dict["cov"][i]
        fpkm = gtf_dict["FPKM"][i]
        if feature_name == "exon":
            assert len(fpkm) == 0, \
                "Expected missing FPKM for exon, got %s" % (fpkm,)
            assert len(cov) > 0 and float(cov) >= 0, \
                "Expected non-negative cov for exon, got %s" % (cov,)
        elif feature_name == "transcript":
            assert len(cov) and float(cov) >= 0, \
                "Expected non-negative cov for transcript, got %s" % (cov,)
            assert len(fpkm) > 0 and float(fpkm) >= 0, \
                "Expected non-negative FPKM for transcript, got %s" % (fpkm,)

def _check_float_cov_and_FPKM(gtf_dict):
    for i, feature_name in enumerate(gtf_dict["feature"]):
        cov = gtf_dict["cov"][i]
Example #6
from gtfparse import read_gtf_as_dict, read_gtf_as_dataframe
from data import data_path

REFSEQ_GTF_PATH = data_path("refseq.ucsc.small.gtf")

def _check_required_columns(gtf_dict):
    assert "feature" in gtf_dict, "Expected column named 'feature' in RefSeq GTF"
    assert "gene_id" in gtf_dict, "Expected column named 'gene_id' in RefSeq GTF"
    assert "transcript_id" in gtf_dict, "Expected column named 'transcript_id' in RefSeq GTF"
    features = set(gtf_dict["feature"])
    assert "exon" in features, "No exon features in GTF (available: %s)" % features
    assert "CDS" in features, "No CDS features in GTF (available: %s)" % features

def test_read_refseq_gtf_as_dict():
    gtf_dict = read_gtf_as_dict(REFSEQ_GTF_PATH)
    _check_required_columns(gtf_dict)

def test_read_refseq_gtf_as_dataframe():
    gtf_df = read_gtf_as_dataframe(REFSEQ_GTF_PATH)
    _check_required_columns(gtf_df)
Example #8
from gtfparse import read_gtf
from data import data_path

REFSEQ_GTF_PATH = data_path("refseq.ucsc.small.gtf")


def _check_required_columns(gtf_dict):
    assert "feature" in gtf_dict, "Expected column named 'feature' in RefSeq GTF"
    assert "gene_id" in gtf_dict, "Expected column named 'gene_id' in RefSeq GTF"
    assert "transcript_id" in gtf_dict, "Expected column named 'transcript_id' in RefSeq GTF"
    features = set(gtf_dict["feature"])
    assert "exon" in features, "No exon features in GTF (available: %s)" % features
    assert "CDS" in features, "No CDS features in GTF (available: %s)" % features


def test_read_refseq_gtf_as_dataframe():
    gtf_df = read_gtf(REFSEQ_GTF_PATH)
    _check_required_columns(gtf_df)
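# Aside (illustrative, not from the original tests): typical interactive
# use of read_gtf, assuming a version of gtfparse that returns a pandas
# DataFrame. The file name is the same fixture used above.
from gtfparse import read_gtf
from data import data_path

df = read_gtf(data_path("refseq.ucsc.small.gtf"))
# Keep only the CDS rows and list the distinct transcript IDs.
cds = df[df["feature"] == "CDS"]
print(sorted(set(cds["transcript_id"])))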
Example #9
from data import data_path
from gtfparse import read_gtf
from nose.tools import eq_

ENSEMBL_GTF_PATH = data_path("ensembl_grch37.head.gtf")

EXPECTED_FEATURES = set([
    "gene",
    "transcript",
    "exon",
    "CDS",
    "UTR",
    "start_codon",
    "stop_codon",
])


def test_ensembl_gtf_columns():
    df = read_gtf(ENSEMBL_GTF_PATH)
    features = set(df["feature"])
    eq_(features, EXPECTED_FEATURES)


# first 1000 lines of GTF only contained these genes
EXPECTED_GENE_NAMES = {
    'FAM41C', 'CICP27', 'RNU6-1100P', 'NOC2L', 'AP006222.1', 'LINC01128',
    'RP4-669L17.1', 'RP11-206L10.2', 'PLEKHN1', 'WBP1LP7', 'RP5-857K21.1',
    'RP5-857K21.5', 'RNU6-1199P', 'RP11-206L10.10', 'RP11-54O7.16', 'CICP7',
    'AL627309.1', 'RP5-857K21.11', 'DDX11L1', 'RP5-857K21.3', 'RP11-34P13.7',
    'AL669831.1', 'MTATP6P1', 'CICP3', 'WBP1LP6', 'LINC00115', 'hsa-mir-6723',
    'RP5-857K21.7', 'SAMD11', 'RP11-206L10.5', 'RP11-34P13.8', 'RP11-206L10.9',
Example #10
# set up random seeds
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
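# Aside (not in the original snippet): seeding alone does not make CUDA
# runs fully deterministic; a common companion setting is to pin cuDNN to
# deterministic kernels, at some cost in throughput.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False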

# ----------------------------------------------------------------------------------------------------------------- #
if args.dataset != "mscoco":
    DataField = NormalField
    TRG = DataField(init_token='<init>', eos_token='<eos>', batch_first=True)
    SRC = DataField(batch_first=True) if not args.share_vocab else TRG
    # NOTE : UNK, PAD, INIT, EOS

# set up the datasets (needs manual setup)
data_prefix = Path(data_path(args.dataset))
args.data_prefix = data_prefix
if args.dataset == "mscoco":
    data_prefix = str(data_prefix)
train_dir = "train" if not args.use_distillation else "distill/" + args.dataset[
    -4:]
if args.dataset == 'iwslt-ende' or args.dataset == 'iwslt-deen':
    #if args.resume:
    #   train_dir += "2"
    logger.info("TRAINING CORPUS : " +
                str(data_prefix / train_dir / 'train.tags.en-de.bpe'))
    if args.mode in ["train", "distill"]:
        train_data = NormalTranslationDataset(
            path=str(data_prefix / train_dir / 'train.tags.en-de.bpe'),
            exts=('.{}'.format(args.src), '.{}'.format(args.trg)),
            fields=(SRC, TRG),
            load_dataset=args.load_dataset,
            save_dataset=args.save_dataset,
            prefix='normal')
    else:
        train_data = None