Ejemplo n.º 1
0
def main():
    """Annotate every GenBank file in ``../data`` with ORF-finder results.

    For each ``*gbff`` file found in ``../data`` this:

    1. converts it to FASTA (``<stem>.fna``) unless that file already exists,
    2. runs the ORF finder on the FASTA (``<stem>.gtf``, ``min_length=50``)
       unless that file already exists,
    3. combines FASTA and GTF into a new circular GenBank file named
       ``<__file__ minus its last three characters>_<gbk file name>``.

    The intermediate ``.fna`` / ``.gtf`` files are addressed by relative
    name, i.e. they live in the current working directory.
    """
    srcdir = '../data'

    gbks = get_files(source=srcdir, endswith='gbff', isfullpath=False)

    for gbk in gbks:
        # Derive the intermediate file names from the GenBank file itself.
        # Pairing independently listed '*.fna', '*.gtf' and '*gbff' files
        # with zip() silently mismatches (or truncates) as soon as the
        # working directory holds a stray or missing file.
        stem = gbk[:-len('.gbff')]
        fna = stem + '.fna'
        gtf = stem + '.gtf'

        # Both intermediate steps are skipped when their output is already
        # present, so the script can be re-run cheaply.
        if not os.path.exists(fna):
            genbank_to_fasta(file=os.path.join(srcdir, gbk), output=fna)

        if not os.path.exists(gtf):
            orf_finder(fasta=fna, output=gtf, min_length=50)

        make_genbank(fasta=fna,
                     gtf=gtf,
                     output=f'{__file__[:-3]}_{gbk}',
                     shape='circular')
Ejemplo n.º 2
0
def main():
    """Train the LSTM model in three successive stages and save checkpoints.

    Data is loaded once from every file in ``../data``.  The same model
    object is then passed through three calls to ``train_model`` in which the
    sequence length grows (128 -> 1024 -> 8192) while the mini-batch size and
    the learning rate shrink.  A checkpoint is written to ``./models`` after
    each stage, plus one final checkpoint without the stage suffix.
    """
    # Function-scope import so this block does not depend on the module's
    # import section for the directory creation below.
    import os

    gbkdir = '../data'
    model_dir = './models'

    gbks = get_files(source=gbkdir, isfullpath=True)
    X, Y = load_genbank(gbks=gbks, cuda=False)

    model = LSTMModel(sizes=[4, 128, 64, 1],
                      batch_first=True,
                      bidirectional=True,
                      cuda=CUDA)

    # Make sure the checkpoint directory exists *before* training starts;
    # otherwise a missing directory is only discovered when torch.save()
    # fails after the first stage has already finished.
    os.makedirs(model_dir, exist_ok=True)

    # (stage, seq_len, mini_batch_size, learning_rate)
    for stage, seq_len, mini_batch_size, learning_rate in [
        (1, 128, 256, 1e-3),
        (2, 1024, 32, 1e-4),
        (3, 8192, 4, 1e-5),
    ]:

        model = train_model(stage=stage,
                            X=X,
                            Y=Y,
                            seq_len=seq_len,
                            mini_batch_size=mini_batch_size,
                            learning_rate=learning_rate,
                            model=model)

        torch.save(model, f'{model_dir}/{EXPERIMENT_NAME}_stage_{stage}.model')

    # Final model, saved under a stage-independent name.
    torch.save(model, f'{model_dir}/{EXPERIMENT_NAME}.model')
Ejemplo n.º 3
0
def start_codons():
    """Print the frequency of each start codon over all CDS features.

    Every file in ``../data`` is read with ``read_genbank``; for each feature
    of type ``'CDS'`` the first three bases of the coding sequence are
    extracted and tallied.  One ``<codon>: <count>`` line is printed per
    distinct codon, in order of first occurrence.
    """
    from collections import Counter

    gbkdir = '../data'
    gbks = get_files(source=gbkdir, isfullpath=True)
    start_codon_counts = Counter()

    for gbk in gbks:
        for chromosome in read_genbank(file=gbk):
            seq = chromosome.sequence

            for f in chromosome.features:
                if f.type != 'CDS':
                    continue

                if f.strand == '+':
                    # The slice treats f.start as a 1-based position
                    # (presumably GenBank coordinates -- TODO confirm).
                    start_codon = seq[f.start - 1:f.start + 2]
                else:
                    # Any other strand value: the codon is taken from the
                    # last three bases up to f.end and reverse-complemented.
                    start_codon = rev_comp(seq[f.end - 3:f.end])

                start_codon_counts[start_codon] += 1

    for codon, count in start_codon_counts.items():
        print(f'{codon}: {count}')
Ejemplo n.º 4
0
def main():
    """Draw one Venn diagram per genome comparing two GenBank annotations.

    Each ``*gbff`` file in ``../data`` is compared (``compare_gbks``) with
    the file in ``../experiment_006/outdir`` whose name ends with the same
    base name.  The resulting ``left`` / ``right`` / ``inner`` values are
    passed to ``plot_venn`` and saved as ``outdir/<species>.png``, where the
    species is looked up from the sequence name of the first record of the
    reference file.

    Raises:
        FileNotFoundError: if no file in ``../experiment_006/outdir`` matches
            a reference file.
        KeyError: if a sequence name is not listed in the species table.
    """
    seqname_to_species = {
        'NC_000913': 'Escherichia coli',
        'NC_002505': 'Vibrio cholerae',
        'NC_002516': 'Pseudomonas aeruginosa',
        'NC_003098': 'Streptococcus pneumoniae',
        'NC_004668': 'Enterococcus faecalis',
        'NC_000915': 'Helicobacter pylori',
        'NC_000964': 'Bacillus subtilis',
        'NC_009089': 'Clostridioides difficile',
        'NC_010729': 'Porphyromonas gingivalis',
        'NC_007795': 'Staphylococcus aureus',
        'NC_000962': 'Mycobacterium tuberculosis',
        'NC_003198': 'Salmonella enterica',
        'NC_003888': 'Streptomyces coelicolor',
        'NC_016845': 'Klebsiella pneumoniae',
        'NZ_CP009257': 'Acinetobacter baumannii',
    }

    gbk2_dir = '../experiment_006/outdir'

    gbk1s = get_files(source='../data', endswith='gbff', isfullpath=True)

    os.makedirs('outdir', exist_ok=True)
    for gbk1 in gbk1s:

        # Pair the two annotations by file name rather than by position:
        # zipping two independent directory listings silently compares the
        # wrong genomes (or drops some) when the directories differ.
        matches = get_files(source=gbk2_dir,
                            endswith=os.path.basename(gbk1),
                            isfullpath=True)
        if not matches:
            raise FileNotFoundError(
                f'No file matching {os.path.basename(gbk1)} in {gbk2_dir}')
        gbk2 = matches[0]

        left, right, inner = compare_gbks(gbk1=gbk1, gbk2=gbk2)

        # Only the first record of the reference file determines the title.
        seqname = read_genbank(gbk1)[0].seqname
        title = seqname_to_species[seqname]

        plot_venn(title=title,
                  left=left,
                  right=right,
                  inner=inner,
                  png=f'outdir/{title}.png')
Ejemplo n.º 5
0
def train_model(
        gbkdir: str,
        stage: int,
        mini_batch_size: int,
        seq_len: int,
        learning_rate: float,
        model: LSTMModel,
        model_name: str) -> LSTMModel:
    """Run one training stage on the GenBank data found in ``gbkdir``.

    Args:
        gbkdir: Directory whose files are loaded with ``load_genbank``.
        stage: Stage number; only used to name the TensorBoard log directory.
        mini_batch_size: Mini-batch size handed to the ``Trainer``.
        seq_len: Length passed to ``divide_sequence`` for both X and Y.
        learning_rate: Learning rate of the Adam optimizer.
        model: Model whose parameters are optimized.
        model_name: Name used in the TensorBoard log directory.

    Returns:
        Whatever ``Trainer.train`` returns (annotated as an ``LSTMModel``).
    """
    X, Y = load_genbank(
        gbks=get_files(source=gbkdir, isfullpath=True),
        label_length=LABEL_LENGTH)

    # Binary classification: flatten the labels to a float column vector.
    if not SOFTMAX:
        Y = Y.view(-1, 1).float()

    X = divide_sequence(X, seq_len=seq_len, pad=True)
    Y = divide_sequence(Y, seq_len=seq_len, pad=True)

    X, Y = shuffle(X, Y)

    X_train, X_test = split(X, training_fraction=TRAINING_FRACTION, dim=0)
    Y_train, Y_test = split(Y, training_fraction=TRAINING_FRACTION, dim=0)

    # The class weight is computed from the full (train + test) label set.
    class_weight = get_class_weight(Y)
    if CUDA:
        class_weight = class_weight.cuda()

    # Loss and accuracy function depend on the output mode; the class weight
    # is only consumed by the softmax (cross-entropy) loss.
    if SOFTMAX:
        criterion = nn.CrossEntropyLoss(weight=class_weight)
        accuracy = softmax_accuracy
    else:
        criterion = nn.BCEWithLogitsLoss()
        accuracy = binary_accuracy

    trainer = Trainer(
        model=model,
        X_train=X_train,
        Y_train=Y_train,
        X_test=X_test,
        Y_test=Y_test,
        criterion=criterion,
        optimizer=optim.Adam(model.parameters(), lr=learning_rate),
        accuracy=accuracy,
        mini_batch_size=mini_batch_size,
        log_dir=f'tensorboard/{EXPERIMENT_NAME}/{model_name}_stage_{stage}')

    return trainer.train(
        max_epochs=MAX_EPOCHS,
        overtrain_epochs=OVERTRAIN_EPOCHS)
Ejemplo n.º 6
0
def main():
    """Re-annotate every GenBank file in ``../data`` with model-predicted CDS.

    A saved model is loaded (on GPU when ``torch.cuda.is_available()``),
    wrapped in a ``Predictor`` and used by a ``CDSAnnotator`` to annotate the
    sequence of each record.  The original locus text is copied onto each
    new record, and the records of one input file are written to
    ``./outdir/<EXPERIMENT_NAME>_<input file name>``.
    """
    model_file = '../experiment_003/models/experiment_003_model_3.model'
    gbkdir = '../data'
    outdir = './outdir'
    min_protein_len = 50
    cuda = torch.cuda.is_available()

    predictor = Predictor(
        model=load_model(file=model_file, cuda=cuda),
        output_to_label=binary_output_to_label)

    for gbk in get_files(source=gbkdir, isfullpath=True):

        annotated = []

        for original in read_genbank(file=gbk):

            # A fresh annotator is built for every record.
            annotator = CDSAnnotator(
                predictor=predictor,
                min_protein_len=min_protein_len)

            predicted: Chromosome = annotator.annotate(
                dna=original.sequence,
                seqname=original.seqname,
                circular=original.circular)

            # Carry the source record's locus text over to the new record.
            predicted.genbank_locus_text = original.genbank_locus_text

            annotated.append(predicted)

        os.makedirs(outdir, exist_ok=True)
        destination = f'{outdir}/{EXPERIMENT_NAME}_{os.path.basename(gbk)}'
        write_genbank(
            data=annotated,
            file=destination,
            use_locus_text=True)
Ejemplo n.º 7
0
def validate_model():
    """Validate the stage-2 checkpoint on sequences of length 8192.

    Training with ``seq_len = 8192`` kept failing, so this only checks how
    the model saved after stage 2 performs on such long sequences.

    The data in ``../data`` is loaded, divided into sequences of length 8192,
    shuffled and split into training and test parts; ``Trainer.validate`` is
    then called once.  The ``SummaryWriter`` handed to the trainer logs to
    ``tensorboard/<EXPERIMENT_NAME>/validate_seq_len_8192`` and is always
    closed, even when validation raises.
    """
    gbkdir = '../data'
    seq_len = 8192
    mini_batch_size = 32

    gbks = get_files(source=gbkdir, isfullpath=True)
    X, Y = load_genbank(gbks=gbks, cuda=False)

    X, Y = divide_sequence(X=X, Y=Y, seq_len=seq_len)
    X, Y = shuffle(X=X, Y=Y)
    X_train, X_test = split(X, training_fraction=TRAINING_FRACTION)
    Y_train, Y_test = split(Y, training_fraction=TRAINING_FRACTION)

    # NOTE(review): torch.load unpickles the file; only load checkpoints
    # produced by this project.
    model = torch.load(f'./models/{EXPERIMENT_NAME}_stage_2.model')
    criterion = nn.BCEWithLogitsLoss()
    # The optimizer is created only because Trainer is constructed with one
    # here; no training step is requested in this function.
    optimizer = optim.Adam(model.parameters())

    writer = SummaryWriter(
        log_dir=f'tensorboard/{EXPERIMENT_NAME}/validate_seq_len_{seq_len}')

    # try/finally guarantees the writer is closed if Trainer construction or
    # validation fails (e.g. out of memory on these long sequences).
    try:
        trainer = Trainer(model=model,
                          X_train=X_train,
                          Y_train=Y_train,
                          X_test=X_test,
                          Y_test=Y_test,
                          criterion=criterion,
                          optimizer=optimizer,
                          mini_batch_size=mini_batch_size,
                          writer=writer)

        trainer.validate()
    finally:
        writer.close()
Ejemplo n.º 8
0
def main():
    """Write a CSV of per-species precision/recall for LSTM and ORFfinder.

    For every ``*gbff`` file name in ``../data`` the matching files in the
    LSTM output directory and the ORFfinder output directory are located by
    file-name suffix.  ``get_precision_recall`` is evaluated for each
    prediction against the reference file, and one row per file is written
    to ``<__file__ minus its last three characters>.csv``.

    Raises:
        IndexError: if a directory contains no file matching a reference
            file name.
        KeyError: if a sequence name is not listed in the species table.
    """
    true_gbk_dir = '../data'
    lstm_gbk_dir = '../experiment_006/outdir'
    orffinder_gbk_dir = '../experiment_009/outdir'
    output_csv = f'{__file__[:-3]}.csv'

    columns = [
        'Species', 'LSTM Precision', 'LSTM Recall', 'ORFfinder Precision',
        'ORFfinder Recall'
    ]

    seqname_to_species = {
        'NC_000913': 'Escherichia coli',
        'NC_002505': 'Vibrio cholerae',
        'NC_002516': 'Pseudomonas aeruginosa',
        'NC_003098': 'Streptococcus pneumoniae',
        'NC_004668': 'Enterococcus faecalis',
        'NC_000915': 'Helicobacter pylori',
        'NC_000964': 'Bacillus subtilis',
        'NC_009089': 'Clostridioides difficile',
        'NC_010729': 'Porphyromonas gingivalis',
        'NC_007795': 'Staphylococcus aureus',
        'NC_000962': 'Mycobacterium tuberculosis',
        'NC_003198': 'Salmonella enterica',
        'NC_003888': 'Streptomyces coelicolor',
        'NC_016845': 'Klebsiella pneumoniae',
        'NZ_CP009257': 'Acinetobacter baumannii',
    }

    fnames = get_files(source=true_gbk_dir, endswith='gbff', isfullpath=False)

    # Rows are collected in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []

    for fname in fnames:

        # Locate the same genome in all three directories by file name.
        true_gbk = get_files(source=true_gbk_dir,
                             endswith=fname,
                             isfullpath=True)[0]

        lstm_gbk = get_files(source=lstm_gbk_dir,
                             endswith=fname,
                             isfullpath=True)[0]

        orffinder_gbk = get_files(source=orffinder_gbk_dir,
                                  endswith=fname,
                                  isfullpath=True)[0]

        # Only the first record of the reference file names the species.
        seqname = read_genbank(true_gbk)[0].seqname
        species = seqname_to_species[seqname]

        lstm_precision, lstm_recall = get_precision_recall(
            true_gbk=true_gbk, predicted_gbk=lstm_gbk)

        orffinder_precision, orffinder_recall = get_precision_recall(
            true_gbk=true_gbk, predicted_gbk=orffinder_gbk)

        rows.append({
            'Species': species,
            'LSTM Precision': lstm_precision,
            'LSTM Recall': lstm_recall,
            'ORFfinder Precision': orffinder_precision,
            'ORFfinder Recall': orffinder_recall,
        })

    # Passing `columns` keeps the header (and column order) even when no
    # input file was found.
    df = pd.DataFrame(rows, columns=columns)

    df.to_csv(output_csv, index=False)