def main():
    """Score every single-base substitution of every sequence in a fasta file.

    Command line: ``fasta weights output [-o] [-e]``.

    For each position of each sequence, the base is replaced in turn by each
    base code 1-4 (A, T, C, G) that differs from the current one.  All mutants
    are scored by the model built from ``weights`` and one tab-separated line
    per mutant is written to ``output``: name, 0-based position, substituted
    base, score.  The file is written without a trailing newline.

    Assumes ``fasta.load_fasta`` yields ``(sequence, name)`` pairs whose
    sequences are lists of integer base codes -- TODO confirm against the
    ``fasta`` module.

    Raises:
        Exception: if ``output`` already exists and -o was not given.
        ValueError: if no mutants could be generated, or if -e is used and a
            header contains no Ensembl transcript ID.
    """
    # Options
    parser = argparse.ArgumentParser()
    parser.add_argument('fasta', help='Fasta file of sequences to mutate.')
    parser.add_argument(
        'weights',
        help='File containing model weights. '
             'This specifies which model to use.')
    parser.add_argument(
        'output',
        help='File where results will be written. By default, this script '
             'will not run if the output file already exists. Use the -o '
             'option to overwrite an existing file.')
    parser.add_argument(
        '-o',
        help='Use this option if you want to overwrite an existing output '
             'file.',
        action='store_true')
    parser.add_argument(
        '-e',
        help='Use this option to write just the Ensembl transcript ID '
             'instead of the full header.',
        action='store_true')
    args = parser.parse_args()

    ##########
    ## MAIN ##
    ##########

    # Integer base code -> letter.  Code 0 ('N') is never substituted in.
    lookup = dict(zip(range(5), 'NATCG'))

    if not args.o and os.path.exists(args.output):
        raise Exception(args.output + ' already exists!')

    print("Reading input files...")
    full_seqs = fasta.load_fasta(args.fasta, 0)

    mut = []
    for seq, name in full_seqs:
        for i, current in enumerate(seq):
            for b in range(1, 5):
                if b != current:
                    mut_seq = seq[:i] + [b] + seq[i + 1:]
                    mut.append((mut_seq, str(i), lookup[b], name))
    if not mut:
        # zip(*mut) below cannot be unpacked into four names when empty.
        raise ValueError('No sequences to mutate were found in ' + args.fasta)

    seqs, pos, base, names = zip(*mut)

    if args.e:
        # Escaped dot and optional version suffix: the previous pattern
        # ('ENST\d*.\d*') swallowed whatever character followed an
        # unversioned ID.  Checked before the model is built so a bad header
        # fails fast with a readable message.
        transpat = re.compile(r'ENST\d+(?:\.\d+)?')
        short_names = []
        for header in names:
            found = transpat.search(header)
            if found is None:
                raise ValueError(
                    'No Ensembl transcript ID found in header: ' + header)
            short_names.append(found.group())
        names = short_names

    mRNN = model.build_model(args.weights)

    print("Evaluating sequences...")
    scores = [str(score) for score in mRNN.batch_predict(seqs, True)]

    lines = ['\t'.join(fields) for fields in zip(names, pos, base, scores)]
    with open(args.output, 'w') as out:
        out.write('\n'.join(lines))
def train(posFastaFile, negFastaFile, posValFasta, negValFasta, parameters):
    """Load the four fasta sets, build a model and train it.

    Args:
        posFastaFile: fasta file of positive training sequences.
        negFastaFile: fasta file of negative training sequences.
        posValFasta: fasta file of positive validation sequences.
        negValFasta: fasta file of negative validation sequences.
        parameters: dict providing the keys 'min_length', 'weights',
            'embedding_size', 'recurrent_gate_size', 'dropout', 'epochs',
            'output', 'max_length', 'save_freq' and 'early_stopping'.

    Returns:
        Whatever ``model.train_model`` returns.
    """
    print("Reading input files...")
    # Same load order as the argument order: pos, neg, pos-val, neg-val.
    loaded = [
        fasta.load_fasta(path, parameters['min_length'])
        for path in (posFastaFile, negFastaFile, posValFasta, negValFasta)
    ]
    training_pair = (loaded[0], loaded[1])
    validation_pair = (loaded[2], loaded[3])

    print("Building new model...")
    network = model.build_model(
        parameters['weights'],
        parameters['embedding_size'],
        parameters['recurrent_gate_size'],
        5,
        parameters['dropout'])
    print(inspect.getmodule(network.__class__))

    print("Training model...")
    return model.train_model(
        network,
        training_pair,
        validation_pair,
        parameters['epochs'],
        parameters['output'],
        parameters['max_length'],
        parameters['save_freq'],
        parameters['early_stopping'])
def load_file(input_filename, min_peptide_length=9, max_peptide_length=31):
    """
    Load mutated peptides from a FASTA, VCF, or MAF file.

    Files ending in ".fasta" or ".fa" are passed straight to `load_fasta`.
    Any other file is read with `load_variants`, de-duplicated, and
    expanded with `expand_transcripts`.

    Parameters
    --------
    input_filename : str

    min_peptide_length : int

    max_peptide_length : int

    Returns a dataframe with columns:
        - chr : chromosome
        - pos : position in the chromosome
        - ref : reference DNA
        - alt : alternate DNA
        - info : gene name and entrez gene ID
        - stable_id_transcript : Ensembl transcript ID
        - SourceSequence : region of protein around mutation
        - MutationStart : first amino acid modified
        - MutationEnd : last mutated amino acid
        - GeneMutationInfo : original genetic variant e.g. chr3 g.484899 C>T
        - PeptideMutationInfo : annotation e.g. V600E
    """
    is_fasta = (input_filename.endswith(".fasta")
                or input_filename.endswith(".fa"))
    if is_fasta:
        return load_fasta(input_filename)

    unique_variants = load_variants(input_filename).drop_duplicates()
    length_limits = {
        "min_peptide_length": min_peptide_length,
        "max_peptide_length": max_peptide_length,
    }
    return expand_transcripts(unique_variants, input_filename, **length_limits)
def load_file(input_filename, min_peptide_length=9, max_peptide_length=31):
    """
    Load mutated peptides from a FASTA, VCF, or MAF file.

    FASTA input (".fasta" / ".fa" suffix) is returned directly from
    `load_fasta`. For the other formats the variants are loaded,
    duplicate rows are dropped, and the result is expanded across
    transcripts by `expand_transcripts`.

    Parameters
    --------
    input_filename : str

    min_peptide_length : int

    max_peptide_length : int

    Returns a dataframe with columns:
        - chr : chromosome
        - pos : position in the chromosome
        - ref : reference DNA
        - alt : alternate DNA
        - info : gene name and entrez gene ID
        - stable_id_transcript : Ensembl transcript ID
        - SourceSequence : region of protein around mutation
        - MutationStart : first amino acid modified
        - MutationEnd : last mutated amino acid
        - GeneMutationInfo : original genetic variant e.g. chr3 g.484899 C>T
        - PeptideMutationInfo : annotation e.g. V600E
    """
    if input_filename.endswith((".fasta", ".fa")):
        return load_fasta(input_filename)

    variants = load_variants(input_filename)
    deduplicated = variants.drop_duplicates()
    return expand_transcripts(
        deduplicated,
        input_filename,
        min_peptide_length=min_peptide_length,
        max_peptide_length=max_peptide_length)
def main():
    """Score the sequences of one fasta file with an ensemble of models.

    Expects exactly one positional argument (the fasta file); otherwise
    ``usage()`` is called (presumably it exits -- defined elsewhere).

    ``-w/--weights`` takes a comma-separated list of weights files; one model
    is built per file.  Results are written by
    ``evaluate.ensemble_evaluate_sequences`` to ``-o/--output`` or, when that
    is not given, to ``<fasta>.mRNNensemble``.

    Exits with status 1 when no weights are supplied.
    """
    # Options
    # -l, -L and -n had handlers below but were missing from the short-option
    # spec (and num_train from the long one), so getopt rejected them.
    # NOTE(review): -p is accepted here but no branch handles it, so it
    # reaches the "unhandled option" assertion; its intent is not visible.
    opts, files = getopt.getopt(sys.argv[1:], "hvo:w:E:b:e:r:d:t:l:L:n:p:f:", [
        "help",
        "output=",
        "weights=",
        "epochs=",
        "batch_size=",
        "embedding_size=",
        "recurrent_gate_size=",
        "dropout=",
        "test=",
        "min_length=",
        "max_length=",
        "num_train=",
        "file_label=",
    ])

    if len(files) != 1:
        usage()

    fastaFile = files[0]
    # Two spaces after the colon match the original print-statement output.
    print("using fasta file:  " + fastaFile)

    # Defaults:
    parameters = {}
    parameters['output'] = None
    parameters['verbose'] = False
    parameters['weights'] = None
    parameters['batch_size'] = 16
    parameters['embedding_size'] = 128
    parameters['recurrent_gate_size'] = 256
    parameters['dropout'] = 0.1
    parameters['test'] = 0.1
    parameters['min_length'] = 200
    parameters['max_length'] = 1000
    parameters['num_train'] = 10000
    parameters['epochs'] = 50
    parameters['save_freq'] = 3
    parameters['file_label'] = ""

    # loop over options:
    for option, argument in opts:
        if option == "-v":
            # Was parameters[verbose]: an undefined name, i.e. a NameError.
            parameters['verbose'] = True
        elif option in ("-h", "--help"):
            usage()
        elif option in ("-o", "--output"):
            parameters['output'] = argument
        elif option in ("-w", "--weights"):
            parameters['weights'] = argument
        elif option in ("-E", "--epochs"):
            parameters['epochs'] = int(argument)
        elif option in ("-b", "--batch_size"):
            parameters['batch_size'] = int(argument)
        elif option in ("-e", "--embedding_size"):
            parameters['embedding_size'] = int(argument)
        elif option in ("-r", "--recurrent_gate_size"):
            # Previously parsed by getopt but never handled.
            parameters['recurrent_gate_size'] = int(argument)
        elif option in ("-d", "--dropout"):
            parameters['dropout'] = float(argument)
        elif option in ("-t", "--test"):
            parameters['test'] = float(argument)
        elif option in ("-l", "--min_length"):
            parameters['min_length'] = int(argument)
        elif option in ("-L", "--max_length"):
            parameters['max_length'] = int(argument)
        elif option in ("-n", "--num_train"):
            parameters['num_train'] = int(argument)
        elif option in ("-f", "--file_label"):
            parameters['file_label'] = argument
        else:
            assert False, "unhandled option"

    ##########
    ## MAIN ##
    ##########

    print("Reading input files...")
    sequences = fasta.load_fasta(fastaFile, parameters['min_length'])

    if not parameters['weights']:
        print("No weights given with -w parameter.\n")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    models = []
    for modelFile in parameters['weights'].split(','):
        print("Building model...")
        mRNN = model.build_model(modelFile,
                                 parameters['embedding_size'],
                                 parameters['recurrent_gate_size'],
                                 5,
                                 parameters['dropout'])
        models.append(mRNN)

    print("Evaluating sequences...")
    output = parameters['output'] or (fastaFile + ".mRNNensemble")
    evaluate.ensemble_evaluate_sequences(models, sequences, output,
                                         parameters['batch_size'])
def main():
    """Train a new model from positive/negative training and validation sets.

    Expects exactly four positional fasta files, in this order: positive,
    negative, positive validation, negative validation.  With any other
    count ``usage()`` is called (presumably it exits -- defined elsewhere).

    Options override the defaults set below; the resulting ``parameters``
    dict is passed piecewise to ``model.build_model`` and
    ``model.train_model``.

    Returns:
        Whatever ``model.train_model`` returns.
    """
    # Options
    # -n/--num_train had a handler below but was missing from both getopt
    # specs, so it could never be passed on the command line.
    opts, files = getopt.getopt(sys.argv[1:], "hvo:w:E:b:e:r:d:t:l:L:n:s:", [
        "help",
        "output=",
        "weights=",
        "epochs=",
        "batch_size=",
        "embedding_size=",
        "recurrent_gate_size=",
        "dropout=",
        "test=",
        "min_length=",
        "max_length=",
        "num_train=",
        "early_stopping=",
    ])

    if len(files) != 4:
        usage()

    posFastaFile = files[0]
    negFastaFile = files[1]
    posValFasta = files[2]
    negValFasta = files[3]
    # Two spaces after each colon match the original print-statement output.
    print("using positive file:  " + posFastaFile)
    print("using negative file:  " + negFastaFile)
    print("using positive validation file:  " + posValFasta)
    print("using negative validation file:  " + negValFasta)

    # Defaults:
    parameters = {}
    parameters['output'] = None
    parameters['verbose'] = False
    parameters['weights'] = None
    parameters['batch_size'] = 16
    parameters['embedding_size'] = 128
    parameters['recurrent_gate_size'] = 256
    parameters['dropout'] = 0.5
    parameters['test'] = 0.1
    parameters['min_length'] = 200
    parameters['max_length'] = 1000
    parameters['num_train'] = 10000
    parameters['epochs'] = 25
    parameters['save_freq'] = 1
    parameters['early_stopping'] = None

    # loop over options:
    for option, argument in opts:
        if option == "-v":
            # Was parameters[verbose]: an undefined name, i.e. a NameError.
            parameters['verbose'] = True
        elif option in ("-h", "--help"):
            usage()
        elif option in ("-o", "--output"):
            parameters['output'] = argument
        elif option in ("-w", "--weights"):
            parameters['weights'] = argument
        elif option in ("-E", "--epochs"):
            parameters['epochs'] = int(argument)
        elif option in ("-b", "--batch_size"):
            parameters['batch_size'] = int(argument)
        elif option in ("-e", "--embedding_size"):
            parameters['embedding_size'] = int(argument)
        elif option in ("-r", "--recurrent_gate_size"):
            parameters['recurrent_gate_size'] = int(argument)
        elif option in ("-d", "--dropout"):
            parameters['dropout'] = float(argument)
        elif option in ("-t", "--test"):
            parameters['test'] = float(argument)
        elif option in ("-l", "--min_length"):
            parameters['min_length'] = int(argument)
        elif option in ("-L", "--max_length"):
            parameters['max_length'] = int(argument)
        elif option in ("-n", "--num_train"):
            parameters['num_train'] = int(argument)
        elif option in ("-s", "--early_stopping"):
            # getopt always supplies a string for options declared with an
            # argument, so the old "is not None" guard was always true.
            parameters['early_stopping'] = int(argument)
        else:
            assert False, "unhandled option"

    ##########
    ## MAIN ##
    ##########

    print("Reading input files...")
    positives = fasta.load_fasta(posFastaFile, parameters['min_length'])
    negatives = fasta.load_fasta(negFastaFile, parameters['min_length'])
    valpos = fasta.load_fasta(posValFasta, parameters['min_length'])
    valneg = fasta.load_fasta(negValFasta, parameters['min_length'])
    train = positives, negatives
    val = valpos, valneg

    print("Building new model...")
    mRNN = model.build_model(parameters['weights'],
                             parameters['embedding_size'],
                             parameters['recurrent_gate_size'],
                             5,
                             parameters['dropout'])
    print(inspect.getmodule(mRNN.__class__))

    print("Training model...")
    mRNN = model.train_model(mRNN, train, val,
                             parameters['epochs'],
                             parameters['output'],
                             parameters['max_length'],
                             parameters['save_freq'],
                             parameters['early_stopping'])
    return mRNN
def mutantsFromFasta(inputFile):
    """Load the first sequence of ``inputFile`` and mutate it.

    Returns a pair: the first sequence itself, and the result of
    transposing (``zip(*...)``) whatever ``pairMutate`` produces for it.
    """
    records = fasta.load_fasta(inputFile, 0)
    # First record, first field of that record.
    first_sequence = records[0][0]
    mutated = pairMutate(first_sequence)
    return first_sequence, zip(*mutated)
def load_test_seqs():
    """Return the sequences loaded from ``resources/test.fa``.

    The path is relative, so it resolves against the current working
    directory.
    """
    return load_fasta("resources/test.fa")
def main():
    """Evaluate one model on a positive and a negative fasta file.

    Expects exactly two positional arguments (positive fasta, negative
    fasta); otherwise ``usage()`` is called (presumably it exits -- defined
    elsewhere).

    The model is built from ``-w/--weights``, evaluated with
    ``evaluate.evaluate_model``, and the resulting confusion matrix is handed
    to ``evaluate.process_results`` together with the parameter dict.

    Exits with status 1 when no weights are supplied.
    """
    # Options
    # -l, -L and -n had handlers below but were missing from the short-option
    # spec (and num_train from the long one), so getopt rejected them.
    # NOTE(review): -p is accepted here but no branch handles it, so it
    # reaches the "unhandled option" assertion; its intent is not visible.
    opts, files = getopt.getopt(sys.argv[1:], "hvo:w:E:b:e:r:d:t:l:L:n:p:f:", [
        "help",
        "output=",
        "weights=",
        "epochs=",
        "batch_size=",
        "embedding_size=",
        "recurrent_gate_size=",
        "dropout=",
        "test=",
        "min_length=",
        "max_length=",
        "num_train=",
        "file_label=",
    ])

    if len(files) != 2:
        usage()

    posFastaFile = files[0]
    negFastaFile = files[1]
    # Two spaces after each colon match the original print-statement output.
    print("using positive file:  " + posFastaFile)
    print("using negative file:  " + negFastaFile)

    # Defaults:
    parameters = {}
    parameters['output'] = None
    parameters['verbose'] = False
    parameters['weights'] = None
    parameters['batch_size'] = 16
    parameters['embedding_size'] = 128
    parameters['recurrent_gate_size'] = 256
    parameters['dropout'] = 0.1
    parameters['test'] = 0.1
    parameters['min_length'] = 200
    parameters['max_length'] = 1000
    parameters['num_train'] = 10000
    parameters['epochs'] = 50
    parameters['save_freq'] = 3
    parameters['file_label'] = ""

    # loop over options:
    for option, argument in opts:
        if option == "-v":
            # Was parameters[verbose]: an undefined name, i.e. a NameError.
            parameters['verbose'] = True
        elif option in ("-h", "--help"):
            usage()
        elif option in ("-o", "--output"):
            parameters['output'] = argument
        elif option in ("-w", "--weights"):
            parameters['weights'] = argument
        elif option in ("-E", "--epochs"):
            parameters['epochs'] = int(argument)
        elif option in ("-b", "--batch_size"):
            parameters['batch_size'] = int(argument)
        elif option in ("-e", "--embedding_size"):
            parameters['embedding_size'] = int(argument)
        elif option in ("-r", "--recurrent_gate_size"):
            # Previously parsed by getopt but never handled.
            parameters['recurrent_gate_size'] = int(argument)
        elif option in ("-d", "--dropout"):
            parameters['dropout'] = float(argument)
        elif option in ("-t", "--test"):
            parameters['test'] = float(argument)
        elif option in ("-l", "--min_length"):
            parameters['min_length'] = int(argument)
        elif option in ("-L", "--max_length"):
            parameters['max_length'] = int(argument)
        elif option in ("-n", "--num_train"):
            parameters['num_train'] = int(argument)
        elif option in ("-f", "--file_label"):
            parameters['file_label'] = argument
        else:
            assert False, "unhandled option"

    ##########
    ## MAIN ##
    ##########

    print("Reading input files...")
    positives = fasta.load_fasta(posFastaFile, parameters['min_length'])
    negatives = fasta.load_fasta(negFastaFile, parameters['min_length'])
    test = positives, negatives

    print("Building model...")
    if not parameters['weights']:
        print("No weights given with -w parameter.\n")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    mRNN = model.build_model(parameters['weights'],
                             parameters['embedding_size'],
                             parameters['recurrent_gate_size'],
                             5,
                             parameters['dropout'])

    print("Evaluating model...")
    conf_mat = evaluate.evaluate_model(mRNN, test, parameters['batch_size'])
    # The return value was stored in an unused local before; only the call
    # itself is needed here.
    evaluate.process_results(conf_mat, parameters)
def main():
    """Score every 5'-anchored truncation of each transcript in a fasta file.

    Command line: ``fasta weights output [-o] [-e] [-s]``.

    For each sequence, every prefix ``seq[:i + 1]`` is scored by the model
    built from ``weights``.  Each prefix is labelled by where its last base
    falls relative to a ``CDS:<start>-<end>`` field in the defline (1-based,
    inclusive): ``UTR5``, ``CDS``, ``UTR3``, or ``NA`` when there is no such
    field.

    Output lines are tab-separated: name, 0-based position, probability,
    log odds, position class.  With -s the name column is dropped and one
    ``<name>.trunc.txt`` file per transcript is written inside the
    ``output`` directory.  Files are written without a trailing newline.

    Raises:
        Exception: if ``output`` already exists and -o was not given.
        OSError: if the -s output directory cannot be created.
        ValueError: if the fasta file yields no sequences, or if -e is used
            and a defline contains no Ensembl transcript ID.
    """
    # Options
    parser = argparse.ArgumentParser(
        description=(
            "Takes a fasta file and a weights file for an RNN model as "
            "input. After loading the RNN, each transcript in the fasta "
            "file is truncated at every possible position and the model "
            "predicts the score. The output is a tab-delimited file with "
            "the following fields: transcript name, truncation position, "
            "the model's prediction that the truncated sequence is coding, "
            "the log odds of that probability, and information about where "
            "the position is in the transcript (5' UTR, CDS, 3' UTR, or "
            "none). If the -s option is used, the transcipt name is in the "
            "filename, so the field is eliminated."))
    parser.add_argument('fasta',
                        help='Fasta file of sequences for truncation.')
    parser.add_argument(
        'weights',
        help='File containing model weights. '
             'This specifies which model to use.')
    parser.add_argument(
        'output',
        help='Output name. By default, this is the file where results will '
             'be written. If using the -s option, it is the directory where '
             'results will be written. By default, this script will not run '
             'if the output file or directory already exists. Use the -o '
             'option to overwrite an existing file, or to potentially '
             'overwrite files in the output directory if using the -s '
             'option. Note that with the -s option, file names are chosen '
             'based on the defline of the transcript. If the -s and -o '
             'options are used together, files in the output directory will '
             'not be deleted, but may (or may not) be overwritten.')
    parser.add_argument(
        '-o',
        help='Use this option if you want to overwrite an existing output '
             'file, or use an existing output directory, potentially (but '
             'not certainly) overwriting files in it.',
        action='store_true')
    parser.add_argument(
        '-e',
        help='Use this option to write just the Ensembl transcript ID '
             'instead of the full defline.',
        action='store_true')
    parser.add_argument(
        '-s',
        help='Use this option to split the output into individual files '
             'named based on the defline of the transcript. If using this '
             'option, the output argument should be the name of a '
             'directory, not a file.',
        action='store_true')
    args = parser.parse_args()

    ##########
    ## MAIN ##
    ##########

    if not args.o and os.path.exists(args.output):
        field1 = 'directory' if args.s else 'file'
        raise Exception(
            args.output +
            ' already exists! Please choose a different {0} name or use the '
            '-o option to overwrite.\nUse the command python {1} -h for more '
            'details.'.format(field1, sys.argv[0]))

    cds_loc = re.compile(r'CDS:(\d+)-(\d+)')

    print("Reading input files...")
    full_seqs = fasta.load_fasta(args.fasta, 0)

    if args.s and not os.path.isdir(args.output):
        # An existing directory is only reachable here with -o (checked
        # above).  Any other failure (permissions, a plain file in the way)
        # now surfaces as the real OSError instead of an assertion.
        os.mkdir(args.output)

    trunc = []
    for seq, name in full_seqs:
        coords = cds_loc.search(name)
        if coords:
            # there is a CDS field in the defline
            coords = [int(c) for c in coords.group(1, 2)]
        for i in range(len(seq)):
            if not coords:
                pos_class = 'NA'
            elif i < coords[0] - 1:
                # subtract one because info in defline is 1-based
                pos_class = 'UTR5'
            elif i < coords[1]:
                pos_class = 'CDS'
            else:
                pos_class = 'UTR3'
            trunc.append((seq[:i + 1], str(i), name, pos_class))
    if not trunc:
        # zip(*trunc) below cannot be unpacked into four names when empty.
        raise ValueError('No sequences to truncate were found in ' +
                         args.fasta)

    seqs, pos, name, pos_class = zip(*trunc)

    if args.e:
        # Escaped dot and optional version suffix: the previous pattern
        # ('ENST\d*.\d*') swallowed whatever character followed an
        # unversioned ID, which also ended up in -s file names.  Checked
        # before the model is built so a bad defline fails fast.
        transpat = re.compile(r'ENST\d+(?:\.\d+)?')
        names = []
        for defline in name:
            found = transpat.search(defline)
            if found is None:
                raise ValueError(
                    'No Ensembl transcript ID found in defline: ' + defline)
            names.append(found.group())
    else:
        names = [defline.strip() for defline in name]

    mRNN = model.build_model(args.weights)

    print("Evaluating sequences...")
    probs = mRNN.batch_predict(seqs)
    logodds = logit(probs)
    # stringify numbers
    probs = [str(p) for p in probs]
    logodds = [str(lo) for lo in logodds]

    if args.s:
        lines = ['\t'.join(fields)
                 for fields in zip(pos, probs, logodds, pos_class)]
        # put lines in arrays keyed by transcript name
        linedict = {}
        for transcript, line in zip(names, lines):
            linedict.setdefault(transcript, []).append(line)
        # Write through explicit paths instead of chdir-ing into the output
        # directory, so the working directory is never left changed when a
        # write fails.
        for transcript, transcript_lines in linedict.items():
            path = os.path.join(args.output, transcript + '.trunc.txt')
            with open(path, 'w') as out:
                out.write('\n'.join(transcript_lines))
    else:
        lines = ['\t'.join(fields)
                 for fields in zip(names, pos, probs, logodds, pos_class)]
        with open(args.output, 'w') as out:
            out.write('\n'.join(lines))
def main():
    """Evaluate an ensemble of models and write accuracy statistics.

    Expects exactly two positional arguments (positive fasta, negative
    fasta); otherwise ``usage()`` is called (presumably it exits -- defined
    elsewhere).

    ``-w/--weights`` takes a comma-separated list of weights files; one model
    is built per file.  The confusion matrix returned by
    ``evaluate.evaluate_multi_model`` is unpacked as ``[[TN, FP], [FN, TP]]``
    and accuracy, specificity and sensitivity are written, followed by the
    raw matrix, to ``multi_predict.acc.txt`` (or
    ``multi_predict.<file_label>.acc.txt`` when -f is given).

    Exits with status 1 when no weights are supplied.
    """
    # Options
    # -l, -L and -n had handlers below but were missing from the short-option
    # spec (and num_train from the long one), so getopt rejected them.
    # NOTE(review): -p is accepted here but no branch handles it, so it
    # reaches the "unhandled option" assertion; its intent is not visible.
    opts, files = getopt.getopt(sys.argv[1:], "hvo:w:E:b:e:r:d:t:l:L:n:p:f:", [
        "help",
        "output=",
        "weights=",
        "epochs=",
        "batch_size=",
        "embedding_size=",
        "recurrent_gate_size=",
        "dropout=",
        "test=",
        "min_length=",
        "max_length=",
        "num_train=",
        "file_label=",
    ])

    if len(files) != 2:
        usage()

    posFastaFile = files[0]
    negFastaFile = files[1]
    # Two spaces after each colon match the original print-statement output.
    print("using positive file:  " + posFastaFile)
    print("using negative file:  " + negFastaFile)

    # Defaults:
    parameters = {}
    parameters['output'] = None
    parameters['verbose'] = False
    parameters['weights'] = None
    parameters['batch_size'] = 16
    parameters['embedding_size'] = 128
    parameters['recurrent_gate_size'] = 256
    parameters['dropout'] = 0.1
    parameters['test'] = 0.1
    parameters['min_length'] = 200
    parameters['max_length'] = 1000
    parameters['num_train'] = 10000
    parameters['epochs'] = 50
    parameters['save_freq'] = 3
    parameters['file_label'] = ""

    # loop over options:
    for option, argument in opts:
        if option == "-v":
            # Was parameters[verbose]: an undefined name, i.e. a NameError.
            parameters['verbose'] = True
        elif option in ("-h", "--help"):
            usage()
        elif option in ("-o", "--output"):
            parameters['output'] = argument
        elif option in ("-w", "--weights"):
            parameters['weights'] = argument
        elif option in ("-E", "--epochs"):
            parameters['epochs'] = int(argument)
        elif option in ("-b", "--batch_size"):
            parameters['batch_size'] = int(argument)
        elif option in ("-e", "--embedding_size"):
            parameters['embedding_size'] = int(argument)
        elif option in ("-r", "--recurrent_gate_size"):
            # Previously parsed by getopt but never handled.
            parameters['recurrent_gate_size'] = int(argument)
        elif option in ("-d", "--dropout"):
            parameters['dropout'] = float(argument)
        elif option in ("-t", "--test"):
            parameters['test'] = float(argument)
        elif option in ("-l", "--min_length"):
            parameters['min_length'] = int(argument)
        elif option in ("-L", "--max_length"):
            parameters['max_length'] = int(argument)
        elif option in ("-n", "--num_train"):
            parameters['num_train'] = int(argument)
        elif option in ("-f", "--file_label"):
            parameters['file_label'] = argument
        else:
            assert False, "unhandled option"

    ##########
    ## MAIN ##
    ##########

    print("Reading input files...")
    positives = fasta.load_fasta(posFastaFile, parameters['min_length'])
    negatives = fasta.load_fasta(negFastaFile, parameters['min_length'])
    test = positives, negatives

    if not parameters['weights']:
        print("No weights given with -w parameter.\n")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    models = []
    for modelFile in parameters['weights'].split(','):
        print("Building model...")
        mRNN = model.build_model(modelFile,
                                 parameters['embedding_size'],
                                 parameters['recurrent_gate_size'],
                                 5,
                                 parameters['dropout'])
        models.append(mRNN)

    print("Evaluating model...")
    conf_mat = evaluate.evaluate_multi_model(models, test,
                                             parameters['batch_size'])
    [[TN, FP], [FN, TP]] = conf_mat

    def ratio(numerator, denominator):
        # float() forces true division: with integer counts, Python 2's "/"
        # truncates every one of these statistics to 0 (or 1).  An empty
        # class yields NaN instead of a ZeroDivisionError.
        if denominator == 0:
            return float('nan')
        return float(numerator) / denominator

    acc = ratio(TP + TN, TP + TN + FP + FN)
    sens = ratio(TP, TP + FN)
    spec = ratio(TN, TN + FP)

    outFile = "multi_predict" + ".acc.txt"
    if parameters['file_label']:
        outFile = "multi_predict" + "." + parameters['file_label'] + ".acc.txt"

    # "with" guarantees the file is closed even if a write fails.
    with open(outFile, 'w') as F:
        F.write("%s\tACC\t%.4f\n" % (parameters['weights'], acc))
        F.write("%s\tSPEC\t%.4f\n" % (parameters['weights'], spec))
        F.write("%s\tSENS\t%.4f\n" % (parameters['weights'], sens))
        F.write("%d\t%d\n%d\t%d\n" % (TN, FP, FN, TP))
def main():
    """Z-score the model's response to shuffling each transcript segment.

    Command line: ``fasta weights output [-o] [-e] [-n N] [-p PLOTBASE]``.

    Each transcript whose defline carries a ``CDS:<start>-<end>`` field
    (1-based, inclusive) is split into 5' UTR, CDS and 3' UTR.  Each segment
    is shuffled ``-n`` times via ``shuffle_seq`` while the other two are kept
    intact, and the original plus all shuffled transcripts are scored by the
    model built from ``weights``.  Deflines without a CDS field are printed
    and skipped.

    The output file starts with a ``#`` comment line and a header line,
    followed by one tab-separated line per transcript: name, 5' UTR / CDS /
    3' UTR Z-scores, then 5' UTR / CDS / 3' UTR lengths.  A Z-score is
    ``None`` when no shuffled sequences were available for that segment.
    With -p, ``plot_zscore_scatter`` and ``plot_zscore_histogram`` are called
    with that base name.

    Raises:
        Exception: if ``output`` or the -p path already exists and -o was
            not given.
        ValueError: if no transcript has a CDS field, or if -e is used and a
            defline contains no Ensembl transcript ID.
    """
    # Options
    parser = argparse.ArgumentParser(
        description=(
            "Takes a fasta file and a weights file for an RNN model as "
            "input. After loading the RNN, the 3' UTR, CDS, and 5' UTR are "
            "individually shuffled, and the entire transcript is scored. "
            "The number of shuffles done is determined with the -n option. "
            "The model predicts the probability that the shuffled and "
            "unshuffled transcripts are protein coding, and Z-scores are "
            "computed for the 3' UTR, CDS, and 5' UTR. The output is a "
            "tab-delimited file with the following fields: transcript "
            # Column order corrected to match the header written below.
            "name, 5' UTR Z-score, CDS Z-score, 3' UTR Z-score, 5' UTR "
            "length, CDS length, 3' UTR length."))
    parser.add_argument('fasta',
                        help='Fasta file of sequences for shuffling.')
    parser.add_argument(
        'weights',
        help='File containing model weights. '
             'This specifies which model to use.')
    parser.add_argument(
        'output',
        help='Output name. This is the file where results will be written. '
             'By default, this script will not run if the output file '
             'already exists. Use the -o option to overwrite an existing '
             'file.')
    parser.add_argument(
        '-o',
        help='Use this option if you want to overwrite an existing output '
             'file.',
        action='store_true')
    parser.add_argument(
        '-e',
        help='Use this option to write just the Ensembl transcript ID '
             'instead of the full defline.',
        action='store_true')
    parser.add_argument(
        '-n',
        help='Number of times to shuffle each segment. Default 20.',
        default=20,
        type=int)
    parser.add_argument(
        '-p',
        help='Base of filenames to plot to. The plot is a matplotlib plot '
             'of z-score vs sequence length. The -o option also applies to '
             'this file.')
    args = parser.parse_args()

    ##########
    ## MAIN ##
    ##########

    if not args.o:
        # Both paths share one message.  Previously the plot-path branch
        # used a variable that was only bound when the output file existed,
        # so an existing plot path raised NameError instead of this error.
        for path in (args.output, args.p):
            if path and os.path.exists(path):
                raise Exception(
                    path +
                    ' already exists! Please choose a different file name '
                    'or use the -o option to overwrite.\nUse the command '
                    'python {0} -h for more details.'.format(sys.argv[0]))

    cds_loc = re.compile(r'CDS:(\d+)-(\d+)')

    print("Reading input files...")
    full_seqs = fasta.load_fasta(args.fasta, 0)

    seqs = []
    seq_lens = []
    for seq, name in full_seqs:
        coords = cds_loc.search(name)
        if not coords:
            # no CDS field in the defline: report and skip
            print(name)
            continue
        coords = [int(c) for c in coords.group(1, 2)]

        # Defline coordinates are 1-based and inclusive.
        utr5 = seq[:coords[0] - 1]
        cds = seq[coords[0] - 1:coords[1]]
        utr3 = seq[coords[1]:]

        # The unshuffled transcript always comes first in its group; the
        # Z-score loop below relies on that ordering.
        seqs.append((seq, 'orig', name))
        utr5_shuffle = [shuf + cds + utr3
                        for shuf in shuffle_seq(utr5, args.n)]
        cds_shuffle = [utr5 + shuf + utr3
                       for shuf in shuffle_seq(cds, args.n)]
        utr3_shuffle = [utr5 + cds + shuf
                        for shuf in shuffle_seq(utr3, args.n)]
        for seq_type, group in zip(('utr5', 'cds', 'utr3'),
                                   (utr5_shuffle, cds_shuffle, utr3_shuffle)):
            for s in group:
                seqs.append((s, seq_type, name))
        seq_lens.append([len(utr5), len(cds), len(utr3)])

    if not seqs:
        # zip(*seqs) below cannot be unpacked into three names when empty.
        raise ValueError('No sequences with a CDS field were found in ' +
                         args.fasta)

    seqs, seq_type, name = zip(*seqs)

    if args.e:
        # Escaped dot and optional version suffix: the previous pattern
        # ('ENST\d*.\d*') swallowed whatever character followed an
        # unversioned ID.  Checked before the model is built so a bad
        # defline fails fast with a readable message.
        transpat = re.compile(r'ENST\d+(?:\.\d+)?')
        names = []
        for defline in name:
            found = transpat.search(defline)
            if found is None:
                raise ValueError(
                    'No Ensembl transcript ID found in defline: ' + defline)
            names.append(found.group())
    else:
        names = [defline.strip() for defline in name]

    mRNN = model.build_model(args.weights)

    print("Evaluating sequences...")
    probs = mRNN.batch_predict(seqs)
    # delete sequences here, since they are not needed anymore
    del seqs

    # calculate z-scores: walk over runs of consecutive identical names
    Zscores = []
    i = 0
    while i < len(probs):
        batch = []
        curr_name = names[i]
        while curr_name == names[i]:
            batch.append((probs[i], seq_type[i]))
            i += 1
            if i == len(probs):
                break
        assert batch[0][1] == 'orig'
        orig = batch[0][0]
        tmp = {'name': curr_name, 'utr5': None, 'cds': None, 'utr3': None}
        for j in range(3):
            try:
                sub_batch = batch[1 + j * args.n:1 + (j + 1) * args.n]
                # sub_batch[0] raises IndexError when a segment produced no
                # shuffles; that segment then keeps its None Z-score.
                tmp[sub_batch[0][1]] = z_score(
                    orig, tuple(p for p, _ in sub_batch))
            except IndexError:
                pass
        Zscores.append([tmp[key] for key in ['name', 'utr5', 'cds', 'utr3']])

    comments = '#fasta: {0}, weights: {1}, number of shuffles: {2}'.format(
        args.fasta, args.weights, args.n)
    lines = [
        comments,
        "transcript\t5' UTR Z-score\tCDS Z-score\t3' UTR Z-score"
        "\t5' UTR length\tCDS length\t3' UTR length"
    ]
    for z, lens in zip(Zscores, seq_lens):
        lines.append('\t'.join([str(field) for field in z + lens]))

    with open(args.output, 'w') as out:
        out.write('\n'.join(lines))

    if args.p:
        plot_zscore_scatter(Zscores, seq_lens, args.p)
        plot_zscore_histogram(Zscores, seq_lens, args.p)