def main():
    srcdir = '../data'

    # Convert each GenBank (.gbff) file into a FASTA (.fna) file
    gbks = get_files(source=srcdir, endswith='gbff', isfullpath=False)
    for gbk in gbks:
        fna = gbk[:-len('.gbff')] + '.fna'
        if os.path.exists(fna):
            continue
        genbank_to_fasta(file=os.path.join(srcdir, gbk), output=fna)

    # Run ORF finding on each FASTA file to produce a GTF annotation
    fnas = get_files(source='.', endswith='.fna', isfullpath=False)
    for fna in fnas:
        gtf = fna[:-len('.fna')] + '.gtf'
        if os.path.exists(gtf):
            continue
        orf_finder(fasta=fna, output=gtf, min_length=50)

    # Package each FASTA + GTF pair back into a GenBank file
    gtfs = get_files(source='.', endswith='.gtf', isfullpath=False)
    for fna, gtf, gbk in zip(fnas, gtfs, gbks):
        make_genbank(
            fasta=fna,
            gtf=gtf,
            output=f'{__file__[:-3]}_{gbk}',
            shape='circular')
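# Not part of the original listing: a minimal sketch of what a helper like
# genbank_to_fasta() could look like if implemented with Biopython's SeqIO.
# The project's real helper may differ; treat this as an assumption.
from Bio import SeqIO

def genbank_to_fasta_sketch(file: str, output: str):
    # Parse every record in the GenBank file and re-write it in FASTA format
    records = SeqIO.parse(file, 'genbank')
    SeqIO.write(records, output, 'fasta')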
def main():
    gbkdir = '../data'
    gbks = get_files(source=gbkdir, isfullpath=True)
    X, Y = load_genbank(gbks=gbks, cuda=False)

    model = LSTMModel(
        sizes=[4, 128, 64, 1],
        batch_first=True,
        bidirectional=True,
        cuda=CUDA)

    # Train in three stages with progressively longer sequences,
    # smaller mini-batches, and lower learning rates
    for stage, seq_len, mini_batch_size, learning_rate in [
        (1, 128, 256, 1e-3),
        (2, 1024, 32, 1e-4),
        (3, 8192, 4, 1e-5),
    ]:
        model = train_model(
            stage=stage,
            X=X,
            Y=Y,
            seq_len=seq_len,
            mini_batch_size=mini_batch_size,
            learning_rate=learning_rate,
            model=model)
        torch.save(model, f'./models/{EXPERIMENT_NAME}_stage_{stage}.model')

    torch.save(model, f'./models/{EXPERIMENT_NAME}.model')
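# Why main() shrinks the mini-batch size as seq_len grows: each stage feeds the
# model the same total number of bases per mini-batch (128*256 = 1024*32 =
# 8192*4 = 32768), presumably to keep GPU memory use roughly constant while the
# LSTM sees progressively longer contexts at a smaller learning rate.
# Illustration only; the tuples are copied from the training schedule above.
def bases_per_mini_batch():
    for stage, seq_len, mini_batch_size in [(1, 128, 256), (2, 1024, 32), (3, 8192, 4)]:
        print(f'stage {stage}: {seq_len * mini_batch_size} bases per mini-batch')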
def start_codons():
    gbkdir = '../data'
    gbks = get_files(source=gbkdir, isfullpath=True)

    # Tally the start codon of every annotated CDS across all genomes
    start_codon_dict = {}
    for gbk in gbks:
        chromosomes = read_genbank(file=gbk)
        for chromosome in chromosomes:
            seq = chromosome.sequence
            for f in chromosome.features:
                if f.type != 'CDS':
                    continue
                if f.strand == '+':
                    start_codon = seq[f.start - 1:f.start + 2]
                else:  # minus strand: reverse-complement the last three bases
                    start_codon = rev_comp(seq[f.end - 3:f.end])
                start_codon_dict.setdefault(start_codon, 0)
                start_codon_dict[start_codon] += 1

    for codon, count in start_codon_dict.items():
        print(f'{codon}: {count}')
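# Not part of the original listing: a minimal sketch of the rev_comp() helper
# used above, assuming plain upper-case A/C/G/T sequences.
def rev_comp_sketch(seq: str) -> str:
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return ''.join(complement[base] for base in reversed(seq))

assert rev_comp_sketch('ATG') == 'CAT'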
def main():
    seqname_to_species = {
        'NC_000913': 'Escherichia coli',
        'NC_002505': 'Vibrio cholerae',
        'NC_002516': 'Pseudomonas aeruginosa',
        'NC_003098': 'Streptococcus pneumoniae',
        'NC_004668': 'Enterococcus faecalis',
        'NC_000915': 'Helicobacter pylori',
        'NC_000964': 'Bacillus subtilis',
        'NC_009089': 'Clostridioides difficile',
        'NC_010729': 'Porphyromonas gingivalis',
        'NC_007795': 'Staphylococcus aureus',
        'NC_000962': 'Mycobacterium tuberculosis',
        'NC_003198': 'Salmonella enterica',
        'NC_003888': 'Streptomyces coelicolor',
        'NC_016845': 'Klebsiella pneumoniae',
        'NZ_CP009257': 'Acinetobacter baumannii',
    }

    gbk1s = get_files(source='../data', endswith='gbff', isfullpath=True)
    gbk2s = get_files(source='../experiment_006/outdir', endswith='gbff', isfullpath=True)

    os.makedirs('outdir', exist_ok=True)
    for gbk1, gbk2 in zip(gbk1s, gbk2s):
        left, right, inner = compare_gbks(gbk1=gbk1, gbk2=gbk2)
        seqname = read_genbank(gbk1)[0].seqname
        title = seqname_to_species[seqname]
        plot_venn(
            title=title,
            left=left,
            right=right,
            inner=inner,
            png=f'outdir/{title}.png')
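# Not part of the original listing: one way plot_venn() might be implemented,
# assuming (left, right, inner) are counts of reference-only, predicted-only,
# and shared CDSs, and that the matplotlib-venn package is available.
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

def plot_venn_sketch(title: str, left: int, right: int, inner: int, png: str):
    venn2(subsets=(left, right, inner), set_labels=('Reference', 'Predicted'))
    plt.title(title)
    plt.savefig(png, dpi=300)
    plt.close()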
def train_model(
        gbkdir: str,
        stage: int,
        mini_batch_size: int,
        seq_len: int,
        learning_rate: float,
        model: LSTMModel,
        model_name: str) -> LSTMModel:

    gbks = get_files(source=gbkdir, isfullpath=True)
    X, Y = load_genbank(gbks=gbks, label_length=LABEL_LENGTH)

    if not SOFTMAX:  # Reshape for binary classification
        Y = Y.view(-1, 1).float()  # 1D long -> 2D float

    X = divide_sequence(X, seq_len=seq_len, pad=True)
    Y = divide_sequence(Y, seq_len=seq_len, pad=True)

    X, Y = shuffle(X, Y)

    X_train, X_test = split(X, training_fraction=TRAINING_FRACTION, dim=0)
    Y_train, Y_test = split(Y, training_fraction=TRAINING_FRACTION, dim=0)

    weight = get_class_weight(Y)
    if CUDA:
        weight = weight.cuda()

    criterion = nn.CrossEntropyLoss(weight=weight) if SOFTMAX else nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    log_dir = f'tensorboard/{EXPERIMENT_NAME}/{model_name}_stage_{stage}'
    accuracy = softmax_accuracy if SOFTMAX else binary_accuracy

    trainer = Trainer(
        model=model,
        X_train=X_train,
        Y_train=Y_train,
        X_test=X_test,
        Y_test=Y_test,
        criterion=criterion,
        optimizer=optimizer,
        accuracy=accuracy,
        mini_batch_size=mini_batch_size,
        log_dir=log_dir)

    model = trainer.train(
        max_epochs=MAX_EPOCHS,
        overtrain_epochs=OVERTRAIN_EPOCHS)

    return model
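# Not part of the original listing: a sketch of what divide_sequence() might do,
# assuming X is a (total_length, channels) tensor that gets chopped into
# non-overlapping windows of seq_len, zero-padding the tail when pad=True.
import torch

def divide_sequence_sketch(X: torch.Tensor, seq_len: int, pad: bool = True) -> torch.Tensor:
    total_len = X.shape[0]
    if pad and total_len % seq_len != 0:
        pad_len = seq_len - (total_len % seq_len)
        padding = torch.zeros(pad_len, *X.shape[1:], dtype=X.dtype)
        X = torch.cat([X, padding], dim=0)
    else:
        X = X[:total_len - (total_len % seq_len)]  # drop the incomplete tail
    return X.view(-1, seq_len, *X.shape[1:])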
def main():
    model_file = '../experiment_003/models/experiment_003_model_3.model'
    gbkdir = '../data'
    cuda = torch.cuda.is_available()
    min_protein_len = 50
    outdir = './outdir'

    model = load_model(file=model_file, cuda=cuda)
    predictor = Predictor(
        model=model,
        output_to_label=binary_output_to_label)

    gbks = get_files(source=gbkdir, isfullpath=True)
    for gbk in gbks:
        chromosomes = read_genbank(file=gbk)

        # Re-annotate every chromosome with model-predicted CDS features,
        # keeping the original LOCUS line from the input GenBank record
        new_chromosomes = []
        for chromosome in chromosomes:
            annotator = CDSAnnotator(
                predictor=predictor,
                min_protein_len=min_protein_len)
            c: Chromosome = annotator.annotate(
                dna=chromosome.sequence,
                seqname=chromosome.seqname,
                circular=chromosome.circular)
            c.genbank_locus_text = chromosome.genbank_locus_text
            new_chromosomes.append(c)

        os.makedirs(outdir, exist_ok=True)
        write_genbank(
            data=new_chromosomes,
            file=f'{outdir}/{EXPERIMENT_NAME}_{os.path.basename(gbk)}',
            use_locus_text=True)
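# Not part of the original listing: binary_output_to_label(), passed to the
# Predictor above, presumably thresholds the raw per-base logits; logit > 0 is
# equivalent to sigmoid(logit) > 0.5 under the BCEWithLogitsLoss objective
# used during training. A minimal sketch under that assumption:
import torch

def binary_output_to_label_sketch(output: torch.Tensor) -> torch.Tensor:
    return (output > 0).long()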
def validate_model():
    """
    Training with seq_len = 8192 kept failing.
    I just want to see how the model performs on such a long sequence.
    """
    gbkdir = '../data'
    seq_len = 8192
    mini_batch_size = 32

    gbks = get_files(source=gbkdir, isfullpath=True)
    X, Y = load_genbank(gbks=gbks, cuda=False)
    X, Y = divide_sequence(X=X, Y=Y, seq_len=seq_len)
    X, Y = shuffle(X=X, Y=Y)

    X_train, X_test = split(X, training_fraction=TRAINING_FRACTION)
    Y_train, Y_test = split(Y, training_fraction=TRAINING_FRACTION)

    model = torch.load(f'./models/{EXPERIMENT_NAME}_stage_2.model')
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters())
    writer = SummaryWriter(
        log_dir=f'tensorboard/{EXPERIMENT_NAME}/validate_seq_len_{seq_len}')

    trainer = Trainer(
        model=model,
        X_train=X_train,
        Y_train=Y_train,
        X_test=X_test,
        Y_test=Y_test,
        criterion=criterion,
        optimizer=optimizer,
        mini_batch_size=mini_batch_size,
        writer=writer)

    trainer.validate()
    writer.close()
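# Not part of the original listing: a minimal sketch of split(), assuming it
# slices the first TRAINING_FRACTION of samples for training and leaves the
# rest for testing along the given dimension.
import torch

def split_sketch(X: torch.Tensor, training_fraction: float, dim: int = 0):
    n_train = int(X.shape[dim] * training_fraction)
    X_train = X.narrow(dim, 0, n_train)
    X_test = X.narrow(dim, n_train, X.shape[dim] - n_train)
    return X_train, X_test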
def main():
    true_gbk_dir = '../data'
    lstm_gbk_dir = '../experiment_006/outdir'
    orffinder_gbk_dir = '../experiment_009/outdir'
    output_csv = f'{__file__[:-3]}.csv'

    columns = [
        'Species',
        'LSTM Precision',
        'LSTM Recall',
        'ORFfinder Precision',
        'ORFfinder Recall'
    ]

    seqname_to_species = {
        'NC_000913': 'Escherichia coli',
        'NC_002505': 'Vibrio cholerae',
        'NC_002516': 'Pseudomonas aeruginosa',
        'NC_003098': 'Streptococcus pneumoniae',
        'NC_004668': 'Enterococcus faecalis',
        'NC_000915': 'Helicobacter pylori',
        'NC_000964': 'Bacillus subtilis',
        'NC_009089': 'Clostridioides difficile',
        'NC_010729': 'Porphyromonas gingivalis',
        'NC_007795': 'Staphylococcus aureus',
        'NC_000962': 'Mycobacterium tuberculosis',
        'NC_003198': 'Salmonella enterica',
        'NC_003888': 'Streptomyces coelicolor',
        'NC_016845': 'Klebsiella pneumoniae',
        'NZ_CP009257': 'Acinetobacter baumannii',
    }

    fnames = get_files(source=true_gbk_dir, endswith='gbff', isfullpath=False)

    df = pd.DataFrame(columns=columns)
    for fname in fnames:
        true_gbk = get_files(source=true_gbk_dir, endswith=fname, isfullpath=True)[0]
        lstm_gbk = get_files(source=lstm_gbk_dir, endswith=fname, isfullpath=True)[0]
        orffinder_gbk = get_files(source=orffinder_gbk_dir, endswith=fname, isfullpath=True)[0]

        seqname = read_genbank(true_gbk)[0].seqname
        species = seqname_to_species[seqname]

        lstm_precision, lstm_recall = get_precision_recall(
            true_gbk=true_gbk, predicted_gbk=lstm_gbk)
        orffinder_precision, orffinder_recall = get_precision_recall(
            true_gbk=true_gbk, predicted_gbk=orffinder_gbk)

        data = [
            species,
            lstm_precision,
            lstm_recall,
            orffinder_precision,
            orffinder_recall
        ]
        row = {key: val for key, val in zip(columns, data)}
        # DataFrame.append() was removed in pandas 2.0; pd.concat() does the same job
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)

    df.to_csv(output_csv, index=False)
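# Not part of the original listing: a rough sketch of how get_precision_recall()
# might compare annotations, assuming a predicted CDS counts as a true positive
# only when its (start, end, strand) exactly matches a CDS in the reference.
def get_precision_recall_sketch(true_gbk: str, predicted_gbk: str):

    def cds_set(gbk: str) -> set:
        coords = set()
        for chromosome in read_genbank(file=gbk):
            for f in chromosome.features:
                if f.type == 'CDS':
                    coords.add((f.start, f.end, f.strand))
        return coords

    true_cds = cds_set(true_gbk)
    predicted_cds = cds_set(predicted_gbk)
    true_positives = len(true_cds & predicted_cds)

    precision = true_positives / len(predicted_cds) if predicted_cds else 0.
    recall = true_positives / len(true_cds) if true_cds else 0.
    return precision, recall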