Ejemplo n.º 1
0
def main():

    # Options
    parser = argparse.ArgumentParser()
    parser.add_argument('fasta',
                        help='''Fasta file of sequences for truncation.''')
    parser.add_argument(
        'weights',
        help=
        '''File containing model weights. This specifies which model to use.'''
    )
    parser.add_argument(
        'output',
        help='''File where results will be written. By default, this script will
		not run if the output file already exists. Use the -o option to overwrite an existing file.'''
    )
    parser.add_argument(
        '-o',
        help=
        '''Use this option if you want to overwrite an existing output file.''',
        action='store_true')
    parser.add_argument(
        '-e',
        help=
        '''Use this option to write just the Ensembl transcript ID instead of 
		the full header.''',
        action='store_true')
    args = parser.parse_args()

    ##########
    ## MAIN ##
    ##########
    lookup = dict(zip(range(5), 'NATCG'))
    if not args.o and os.path.exists(args.output):
        raise Exception(args.output + ' already exists!')
    if args.e:
        transpat = re.compile('ENST\d*.\d*')
    print "Reading input files..."
    full_seqs = fasta.load_fasta(args.fasta, 0)
    mut = []
    for seq, name in full_seqs:
        for i in xrange(len(seq)):
            for b in xrange(1, 5):
                if b != seq[i]:
                    mut_seq = seq[:i] + [b] + seq[i + 1:]
                    mut.append((mut_seq, str(i), lookup[b], name))
    mRNN = model.build_model(args.weights)
    lines = []
    print "Evaluating sequences..."
    seqs, pos, base, name = zip(*mut)
    if args.e:
        name = [transpat.search(n).group() for n in name]
    scores = mRNN.batch_predict(seqs, True)
    scores = map(str, scores)
    lines = zip(name, pos, base, scores)
    lines = ['\t'.join(line) for line in lines]
    with open(args.output, 'w') as out:
        out.write('\n'.join(lines))
Ejemplo n.º 2
0
def train(posFastaFile, negFastaFile, posValFasta, negValFasta, parameters):
    print "Reading input files..."
    positives = fasta.load_fasta(posFastaFile, parameters['min_length'])
    negatives = fasta.load_fasta(negFastaFile, parameters['min_length'])
    valpos = fasta.load_fasta(posValFasta, parameters['min_length'])
    valneg = fasta.load_fasta(negValFasta, parameters['min_length'])
    train = positives, negatives
    val = valpos, valneg
    print "Building new model..."
    mRNN = model.build_model(parameters['weights'],
                             parameters['embedding_size'],
                             parameters['recurrent_gate_size'], 5,
                             parameters['dropout'])
    print inspect.getmodule(mRNN.__class__)
    print "Training model..."
    mRNN = model.train_model(mRNN, train, val, parameters['epochs'],
                             parameters['output'], parameters['max_length'],
                             parameters['save_freq'],
                             parameters['early_stopping'])
    return mRNN
Ejemplo n.º 3
0
def load_file(input_filename, min_peptide_length=9, max_peptide_length=31):
    """
    Load mutated peptides from a FASTA, VCF, or MAF file.

    Files ending in ".fasta" or ".fa" are handed straight to `load_fasta`.
    Any other file is read with `load_variants`, de-duplicated, and its
    variants are expanded across all annotated transcripts.

    Parameters
    ----------
    input_filename : str

    min_peptide_length : int

    max_peptide_length : int

    Returns a dataframe with columns:
        - chr : chromosome
        - pos : position in the chromosome
        - ref : reference DNA
        - alt : alternate DNA
        - info : gene name and entrez gene ID
        - stable_id_transcript : Ensembl transcript ID
        - SourceSequence : region of protein around mutation
        - MutationStart : first amino acid modified
        - MutationEnd : last mutated amino acid
        - GeneMutationInfo : original genetic variant e.g. chr3 g.484899 C>T
        - PeptideMutationInfo : annotation e.g. V600E
    """
    # FASTA input needs no variant expansion.
    if input_filename.endswith((".fasta", ".fa")):
        return load_fasta(input_filename)

    unique_variants = load_variants(input_filename).drop_duplicates()
    return expand_transcripts(
        unique_variants,
        input_filename,
        min_peptide_length=min_peptide_length,
        max_peptide_length=max_peptide_length)
Ejemplo n.º 4
0
def load_file(input_filename, min_peptide_length=9, max_peptide_length=31):
    """
    Load mutated peptides from a FASTA, VCF, or MAF file.

    For the latter two formats the variants are de-duplicated and then
    expanded across all annotated transcripts.

    Parameters
    ----------
    input_filename : str

    min_peptide_length : int

    max_peptide_length : int

    Returns a dataframe with columns:
        - chr : chromosome
        - pos : position in the chromosome
        - ref : reference DNA
        - alt : alternate DNA
        - info : gene name and entrez gene ID
        - stable_id_transcript : Ensembl transcript ID
        - SourceSequence : region of protein around mutation
        - MutationStart : first amino acid modified
        - MutationEnd : last mutated amino acid
        - GeneMutationInfo : original genetic variant e.g. chr3 g.484899 C>T
        - PeptideMutationInfo : annotation e.g. V600E
    """
    fasta_suffixes = (".fasta", ".fa")
    if not input_filename.endswith(fasta_suffixes):
        # Variant file: load, drop repeated rows, expand over transcripts.
        variants_df = load_variants(input_filename)
        variants_df = variants_df.drop_duplicates()
        return expand_transcripts(variants_df,
                                  input_filename,
                                  min_peptide_length=min_peptide_length,
                                  max_peptide_length=max_peptide_length)

    return load_fasta(input_filename)
Ejemplo n.º 5
0
def main():
    # Options
    opts, files = getopt.getopt(sys.argv[1:], "hvo:w:E:b:e:r:d:t:p:f:", [
        "help",
        "output=",
        "weights=",
        "epochs=",
        "batch_size=",
        "embedding_size=",
        "recurrent_gate_size=",
        "dropout=",
        "test=",
        "min_length=",
        "max_length=",
        "file_label=",
    ])
    if len(files) != 1:
        usage()

    fastaFile = files[0]
    print "using fasta file: ", fastaFile

    # Defaults:
    parameters = {}
    parameters['output'] = None
    parameters['verbose'] = False
    parameters['weights'] = None
    parameters['batch_size'] = 16
    parameters['embedding_size'] = 128
    parameters['recurrent_gate_size'] = 256
    parameters['dropout'] = 0.1
    parameters['test'] = 0.1
    parameters['min_length'] = 200
    parameters['max_length'] = 1000
    parameters['num_train'] = 10000
    parameters['epochs'] = 50
    parameters['save_freq'] = 3
    parameters['file_label'] = ""

    # loop over options:
    for option, argument in opts:
        if option == "-v":
            parameters[verbose] = True
        elif option in ("-h", "--help"):
            usage()
        elif option in ("-o", "--output"):
            parameters['output'] = argument
        elif option in ("-w", "--weights"):
            parameters['weights'] = argument
        elif option in ("-E", "--epochs"):
            parameters['epochs'] = int(argument)
        elif option in ("-b", "--batch_size"):
            parameters['batch_size'] = int(argument)
        elif option in ("-e", "--embedding_size"):
            parameters['embedding_size'] = int(argument)
        elif option in ("-d", "--dropout"):
            parameters['dropout'] = float(argument)
        elif option in ("-t", "--test"):
            parameters['test'] = float(argument)
        elif option in ("-l", "--min_length"):
            parameters['min_length'] = int(argument)
        elif option in ("-L", "--max_length"):
            parameters['max_length'] = int(argument)
        elif option in ("-n", "--num_train"):
            parameters['num_train'] = int(argument)
        elif option in ("-f", "--file_label"):
            parameters['file_label'] = argument
        else:
            assert False, "unhandled option"

    ##########
    ## MAIN ##
    ##########

    print "Reading input files..."
    sequences = fasta.load_fasta(fastaFile, parameters['min_length'])
    if not parameters['weights']:
        print "No weights given with -w parameter.\n"
        sys.exit()
    modelFiles = parameters['weights'].split(',')
    models = []
    for modelFile in modelFiles:
        print "Building model..."
        mRNN = model.build_model(modelFile, parameters['embedding_size'],
                                 parameters['recurrent_gate_size'], 5,
                                 parameters['dropout'])
        models.append(mRNN)
    print "Evaluating sequences..."
    output = fastaFile + ".mRNNensemble"
    if parameters['output']:
        output = parameters['output']
    evaluate.ensemble_evaluate_sequences(models, sequences, output,
                                         parameters['batch_size'])
Ejemplo n.º 6
0
def main():
    # Options
    opts, files = getopt.getopt(sys.argv[1:], "hvo:w:E:b:e:r:d:t:l:L:s:", [
        "help", "output=", "weights=", "epochs=", "batch_size=",
        "embedding_size=", "recurrent_gate_size=", "dropout=", "test=",
        "min_length=", "max_length=", "early_stopping="
    ])
    if len(files) != 4:
        usage()
    posFastaFile = files[0]
    negFastaFile = files[1]
    posValFasta = files[2]
    negValFasta = files[3]
    print "using positive file: ", posFastaFile
    print "using negative file: ", negFastaFile
    print "using positive validation file: ", posValFasta
    print "using negative validation file: ", negValFasta
    # Defaults:
    parameters = {}
    parameters['output'] = None
    parameters['verbose'] = False
    parameters['weights'] = None
    parameters['batch_size'] = 16
    parameters['embedding_size'] = 128
    parameters['recurrent_gate_size'] = 256
    parameters['dropout'] = 0.5
    parameters['test'] = 0.1
    parameters['min_length'] = 200
    parameters['max_length'] = 1000
    parameters['num_train'] = 10000
    parameters['epochs'] = 25
    parameters['save_freq'] = 1
    parameters['early_stopping'] = None
    # loop over options:
    for option, argument in opts:
        if option == "-v":
            parameters[verbose] = True
        elif option in ("-h", "--help"):
            usage()
        elif option in ("-o", "--output"):
            parameters['output'] = argument
        elif option in ("-w", "--weights"):
            parameters['weights'] = argument
        elif option in ("-E", "--epochs"):
            parameters['epochs'] = int(argument)
        elif option in ("-b", "--batch_size"):
            parameters['batch_size'] = int(argument)
        elif option in ("-e", "--embedding_size"):
            parameters['embedding_size'] = int(argument)
        elif option in ("-r", "--recurrent_gate_size"):
            parameters['recurrent_gate_size'] = int(argument)
        elif option in ("-d", "--dropout"):
            parameters['dropout'] = float(argument)
        elif option in ("-t", "--test"):
            parameters['test'] = float(argument)
        elif option in ("-l", "--min_length"):
            parameters['min_length'] = int(argument)
        elif option in ("-L", "--max_length"):
            parameters['max_length'] = int(argument)
        elif option in ("-n", "--num_train"):
            parameters['num_train'] = int(argument)
        elif option in ("-s", "--early_stopping"):
            if argument is not None:
                argument = int(argument)
            parameters['early_stopping'] = argument
        else:
            assert False, "unhandled option"

    ##########
    ## MAIN ##
    ##########

    print "Reading input files..."
    positives = fasta.load_fasta(posFastaFile, parameters['min_length'])
    negatives = fasta.load_fasta(negFastaFile, parameters['min_length'])
    valpos = fasta.load_fasta(posValFasta, parameters['min_length'])
    valneg = fasta.load_fasta(negValFasta, parameters['min_length'])
    train = positives, negatives
    val = valpos, valneg
    print "Building new model..."
    mRNN = model.build_model(parameters['weights'],
                             parameters['embedding_size'],
                             parameters['recurrent_gate_size'], 5,
                             parameters['dropout'])
    print inspect.getmodule(mRNN.__class__)
    print "Training model..."
    mRNN = model.train_model(mRNN, train, val, parameters['epochs'],
                             parameters['output'], parameters['max_length'],
                             parameters['save_freq'],
                             parameters['early_stopping'])
    return mRNN
Ejemplo n.º 7
0
def mutantsFromFasta(inputFile):
    """Load the first sequence from the given file and mutate it.

    Returns a ``(sequence, mutants)`` pair where ``mutants`` is
    ``zip(*pairMutate(sequence))``.
    """
    records = fasta.load_fasta(inputFile, 0)
    # First record, first field -- presumably the sequence itself; verify
    # against fasta.load_fasta.
    first_seq = records[0][0]
    mutants = zip(*pairMutate(first_seq))
    return first_seq, mutants
Ejemplo n.º 8
0
Archivo: tests.py Proyecto: hillst/mRNN
def load_test_seqs():
    """Return the result of ``load_fasta`` on ``resources/test.fa``."""
    return load_fasta("resources/test.fa")
Ejemplo n.º 9
0
def main():
    # Options
    opts, files = getopt.getopt(sys.argv[1:], "hvo:w:E:b:e:r:d:t:p:f:", ["help",
                                                                         "output=",
                                                                         "weights=",
                                                                         "epochs=",
                                                                         "batch_size=",
                                                                         "embedding_size=",
                                                                         "recurrent_gate_size=",
                                                                         "dropout=",
                                                                         "test=",
                                                                         "min_length=",
                                                                         "max_length=",
                                                                         "file_label=",
                                                                     ])
    if len(files) != 2:
        usage()
        
    posFastaFile = files[0]
    negFastaFile = files[1]
    print "using positive file: ", posFastaFile
    print "using negative file: ", negFastaFile
 
    # Defaults:
    parameters = {}
    parameters['output'] = None
    parameters['verbose'] = False
    parameters['weights'] = None
    parameters['batch_size'] = 16
    parameters['embedding_size'] = 128
    parameters['recurrent_gate_size'] = 256
    parameters['dropout'] = 0.1
    parameters['test'] = 0.1
    parameters['min_length'] = 200
    parameters['max_length'] = 1000
    parameters['num_train'] = 10000
    parameters['epochs'] = 50
    parameters['save_freq'] = 3
    parameters['file_label'] = ""

    # loop over options:
    for option, argument in opts:
        if option == "-v":
            parameters[verbose] = True
        elif option in ("-h", "--help"):
            usage()
        elif option in ("-o", "--output"):
            parameters['output'] = argument
        elif option in ("-w", "--weights"):
            parameters['weights'] = argument
        elif option in ("-E", "--epochs"):
            parameters['epochs'] = int(argument)
        elif option in ("-b", "--batch_size"):
            parameters['batch_size'] = int(argument)
        elif option in ("-e", "--embedding_size"):
            parameters['embedding_size'] = int(argument)
        elif option in ("-d", "--dropout"):
            parameters['dropout'] = float(argument)
        elif option in ("-t", "--test"):
            parameters['test'] = float(argument)
        elif option in ("-l", "--min_length"):
            parameters['min_length'] = int(argument)
        elif option in ("-L", "--max_length"):
            parameters['max_length'] = int(argument)
        elif option in ("-n", "--num_train"):
            parameters['num_train'] = int(argument)
        elif option in ("-f", "--file_label"):
            parameters['file_label'] = argument
        else:
            assert False, "unhandled option"

    ##########
    ## MAIN ##
    ##########

    print "Reading input files..."
    positives = fasta.load_fasta(posFastaFile,parameters['min_length'])
    negatives = fasta.load_fasta(negFastaFile,parameters['min_length'])
    test = positives,negatives
    print "Building model..."
    if not parameters['weights']:
        print "No weights given with -w parameter.\n"
        sys.exit()
    mRNN = model.build_model(parameters['weights'],parameters['embedding_size'],parameters['recurrent_gate_size'],5,parameters['dropout'])
    print "Evaluating model..."
    conf_mat =  evaluate.evaluate_model(mRNN, test, parameters['batch_size'])
    acc = evaluate.process_results(conf_mat,parameters)
Ejemplo n.º 10
0
def main():

    # Options
    parser = argparse.ArgumentParser(
        description=
        '''Takes a fasta file and a weights file for an RNN model as input.
		After loading the RNN, each transcript in the fasta file is truncated at every possible position and the model
		predicts the score. The output is a tab-delimited file with the following fields: transcript name, truncation position,
		the model's prediction that the truncated sequence is coding, the log odds of that probability, and information about
		where the position is in the transcript (5' UTR, CDS, 3' UTR, or none). If the -s
		option is used, the transcipt name is in the filename, so the field is eliminated.'''
    )
    parser.add_argument('fasta',
                        help='''Fasta file of sequences for truncation.''')
    parser.add_argument(
        'weights',
        help=
        '''File containing model weights. This specifies which model to use.'''
    )
    parser.add_argument(
        'output',
        help=
        '''Output name. By default, this is the file where results will be written. 
		If using the -s option, it is the directory where results will be written. By default, this script will
		not run if the output file or directory already exists. Use the -o option to overwrite an existing file,
		or to potentially overwrite files in the output directory if using the -s option. Note that with the -s option,
		file names are chosen based on the defline of the transcript. If the -s and -o options are used together,
		files in the output directory will not be deleted, but may (or may not) be overwritten.'''
    )
    parser.add_argument(
        '-o',
        help=
        '''Use this option if you want to overwrite an existing output file, or use an 
		existing output directory, potentially (but not certainly) overwriting files in it.''',
        action='store_true')
    parser.add_argument(
        '-e',
        help=
        '''Use this option to write just the Ensembl transcript ID instead of 
		the full defline.''',
        action='store_true')
    parser.add_argument(
        '-s',
        help=
        '''Use this option to split the output into individual files named based on the 
		defline of the transcript. If using this option, the output argument should be the name of a directory,
		not a file.''',
        action='store_true')
    args = parser.parse_args()

    ##########
    ## MAIN ##
    ##########

    if not args.o and os.path.exists(args.output):
        if args.s:
            field1 = 'directory'
        else:
            field1 = 'file'
        raise Exception(
            args.output +
            ''' already exists! Please choose a different {0} name or use the -o option to overwrite. Use the command python {1} -h for more details.'''
            .format(field1, sys.argv[0]))
    orig_dir = os.getcwd()
    if args.e:
        transpat = re.compile('ENST\d*.\d*')
    cds_loc = re.compile('CDS:(\d+)-(\d+)')
    print "Reading input files..."
    full_seqs = fasta.load_fasta(args.fasta, 0)
    if args.s:
        try:
            os.mkdir(args.output)
        except OSError:
            #os.mkdir raises this error if the path already exists. We should not see this error unless using the args.o option.
            #if the assertion is true, then everything's ok, and we can just ignore the error and move to the directory since it already exists.
            assert args.o == True
    trunc = []
    for seq, name in full_seqs:
        coords = cds_loc.search(name)
        if coords:
            #there is a CDS field in the defline
            coords = map(int, coords.group(1, 2))
        for i in xrange(len(seq)):
            if not coords:
                pos_class = 'NA'
            elif i < coords[
                    0] - 1:  #subtract one because info in defline is 1-based
                pos_class = 'UTR5'
            elif i < coords[1]:
                pos_class = 'CDS'
            else:
                pos_class = 'UTR3'
            trunc.append((seq[:i + 1], str(i), name, pos_class))
    mRNN = model.build_model(args.weights)
    print "Evaluating sequences..."
    seqs, pos, name, pos_class = zip(*trunc)
    cds_coords = []
    if args.e:
        names = [transpat.search(n).group() for n in name]
    else:
        names = [line.strip() for line in name]
    probs = mRNN.batch_predict(seqs)
    logodds = logit(probs)
    #stringify numbers
    probs = map(str, probs)
    logodds = map(str, logodds)

    if args.s:
        os.chdir(args.output)
        lines = zip(pos, probs, logodds, pos_class)
        lines = ['\t'.join(line) for line in lines]
        #put lines in arrays keyed by transcript name
        linedict = {name: [] for name in set(names)}
        for name, line in zip(names, lines):
            linedict[name].append(line)
        for name in linedict:
            with open(name + '.trunc.txt', 'w') as out:
                out.write('\n'.join(linedict[name]))
    #go back to original directory
        os.chdir(orig_dir)
    else:
        lines = zip(names, pos, probs, logodds, pos_class)
        lines = ['\t'.join(line) for line in lines]
        with open(args.output, 'w') as out:
            out.write('\n'.join(lines))
Ejemplo n.º 11
0
def main():
    # Options
    opts, files = getopt.getopt(sys.argv[1:], "hvo:w:E:b:e:r:d:t:p:f:", [
        "help",
        "output=",
        "weights=",
        "epochs=",
        "batch_size=",
        "embedding_size=",
        "recurrent_gate_size=",
        "dropout=",
        "test=",
        "min_length=",
        "max_length=",
        "file_label=",
    ])
    if len(files) != 2:
        usage()

    posFastaFile = files[0]
    negFastaFile = files[1]
    print "using positive file: ", posFastaFile
    print "using negative file: ", negFastaFile

    # Defaults:
    parameters = {}
    parameters['output'] = None
    parameters['verbose'] = False
    parameters['weights'] = None
    parameters['batch_size'] = 16
    parameters['embedding_size'] = 128
    parameters['recurrent_gate_size'] = 256
    parameters['dropout'] = 0.1
    parameters['test'] = 0.1
    parameters['min_length'] = 200
    parameters['max_length'] = 1000
    parameters['num_train'] = 10000
    parameters['epochs'] = 50
    parameters['save_freq'] = 3
    parameters['file_label'] = ""

    # loop over options:
    for option, argument in opts:
        if option == "-v":
            parameters[verbose] = True
        elif option in ("-h", "--help"):
            usage()
        elif option in ("-o", "--output"):
            parameters['output'] = argument
        elif option in ("-w", "--weights"):
            parameters['weights'] = argument
        elif option in ("-E", "--epochs"):
            parameters['epochs'] = int(argument)
        elif option in ("-b", "--batch_size"):
            parameters['batch_size'] = int(argument)
        elif option in ("-e", "--embedding_size"):
            parameters['embedding_size'] = int(argument)
        elif option in ("-d", "--dropout"):
            parameters['dropout'] = float(argument)
        elif option in ("-t", "--test"):
            parameters['test'] = float(argument)
        elif option in ("-l", "--min_length"):
            parameters['min_length'] = int(argument)
        elif option in ("-L", "--max_length"):
            parameters['max_length'] = int(argument)
        elif option in ("-n", "--num_train"):
            parameters['num_train'] = int(argument)
        elif option in ("-f", "--file_label"):
            parameters['file_label'] = argument
        else:
            assert False, "unhandled option"

    ##########
    ## MAIN ##
    ##########

    print "Reading input files..."
    positives = fasta.load_fasta(posFastaFile, parameters['min_length'])
    negatives = fasta.load_fasta(negFastaFile, parameters['min_length'])
    test = positives, negatives
    if not parameters['weights']:
        print "No weights given with -w parameter.\n"
        sys.exit()
    modelFiles = parameters['weights'].split(',')
    models = []
    for modelFile in modelFiles:
        print "Building model..."
        mRNN = model.build_model(modelFile, parameters['embedding_size'],
                                 parameters['recurrent_gate_size'], 5,
                                 parameters['dropout'])
        models.append(mRNN)
    print "Evaluating model..."
    conf_mat = evaluate.evaluate_multi_model(models, test,
                                             parameters['batch_size'])
    [[TN, FP], [FN, TP]] = conf_mat
    acc = (TP + TN) / (TP + TN + FP + FN)
    sens = TP / (TP + FN)
    spec = TN / (TN + FP)
    outFile = "multi_predict" + ".acc.txt"
    if parameters['file_label']:
        outFile = "multi_predict" + "." + parameters['file_label'] + ".acc.txt"

    F = open(outFile, 'w')
    F.write("%s\tACC\t%.4f\n" % (parameters['weights'], acc))
    F.write("%s\tSPEC\t%.4f\n" % (parameters['weights'], spec))
    F.write("%s\tSENS\t%.4f\n" % (parameters['weights'], sens))
    F.write("%d\t%d\n%d\t%d\n" % (TN, FP, FN, TP))
    F.close()
Ejemplo n.º 12
0
def main():

    # Options
    parser = argparse.ArgumentParser(
        description=
        '''Takes a fasta file and a weights file for an RNN model as input.
		After loading the RNN, the 3' UTR, CDS, and 5' UTR are individually shuffled, and the entire transcript is scored.
		The number of shuffles done is determined with the -n option. The model predicts the probability that the shuffled
		and unshuffled transcripts are protein coding, and Z-scores are computed for the 3' UTR, CDS, and 5' UTR.
		The output is a tab-delimited file with the following fields: transcript name, 3' UTR Z-score, CDS Z-score, 5' UTR Z-score,
		3' UTR length, CDS length, 5' UTR length.''')
    parser.add_argument('fasta',
                        help='''Fasta file of sequences for shuffling.''')
    parser.add_argument(
        'weights',
        help=
        '''File containing model weights. This specifies which model to use.'''
    )
    parser.add_argument(
        'output',
        help='''Output name. This is the file where results will be written. 
		By default, this script will not run if the output file already exists. Use the -o option to overwrite an existing file.'''
    )
    parser.add_argument(
        '-o',
        help=
        '''Use this option if you want to overwrite an existing output file.''',
        action='store_true')
    parser.add_argument(
        '-e',
        help=
        '''Use this option to write just the Ensembl transcript ID instead of 
		the full defline.''',
        action='store_true')
    parser.add_argument(
        '-n',
        help='''Number of times to shuffle each segment. Default 20.''',
        default=20,
        type=int)
    parser.add_argument(
        '-p',
        help=
        '''Base of filenames to plot to. The plot is a matplotlib plot of z-score 
				vs sequence length. The -o option also applies to this file.''')
    args = parser.parse_args()

    ##########
    ## MAIN ##
    ##########

    if not args.o:
        if os.path.exists(args.output):
            field1 = 'file'
            raise Exception(
                args.output +
                ''' already exists! Please choose a different {0} name or use the -o option to overwrite. Use the command python {1} -h for more details.'''
                .format(field1, sys.argv[0]))
        if args.p and os.path.exists(args.p):
            raise Exception(
                args.p +
                ''' already exists! Please choose a different {0} name or use the -o option to overwrite. Use the command python {1} -h for more details.'''
                .format(field1, sys.argv[0]))
    orig_dir = os.getcwd()
    if args.e:
        transpat = re.compile('ENST\d*.\d*')
    #never mind, CDS is sufficient
    cds_loc = re.compile('CDS:(\d+)-(\d+)')
    print "Reading input files..."
    full_seqs = fasta.load_fasta(args.fasta, 0)
    seqs = []
    seq_lens = []
    for seq, name in full_seqs:
        coords = cds_loc.search(name)
        #do shuffling here
        if coords:
            #there is a CDS field in the defline
            coords = map(int, coords.group(1, 2))
        else:
            print name
            continue
        utr5 = seq[:coords[0] - 1]
        cds = seq[coords[0] - 1:coords[1]]
        utr3 = seq[coords[1]:]
        seqs.append((seq, 'orig', name))
        utr5_shuffle = [
            shuf_utr5 + cds + utr3 for shuf_utr5 in shuffle_seq(utr5, args.n)
        ]
        cds_shuffle = [
            utr5 + shuf_cds + utr3 for shuf_cds in shuffle_seq(cds, args.n)
        ]
        utr3_shuffle = [
            utr5 + cds + shuf_utr3 for shuf_utr3 in shuffle_seq(utr3, args.n)
        ]
        for seq_type, group in zip(('utr5', 'cds', 'utr3'),
                                   (utr5_shuffle, cds_shuffle, utr3_shuffle)):
            for s in group:
                seqs.append((s, seq_type, name))
        seq_lens.append(map(len, [utr5, cds, utr3]))

    mRNN = model.build_model(args.weights)
    print "Evaluating sequences..."
    seqs, seq_type, name = zip(*seqs)
    cds_coords = []
    if args.e:
        names = [transpat.search(n).group() for n in name]
    else:
        names = [line.strip() for line in name]
    probs = mRNN.batch_predict(seqs)

    #delete sequences here, since they are not needed anymore
    del seqs

    logodds = logit(probs)
    Zscores = []
    #calculate z-scores
    i = 0
    while i < len(probs):
        batch = []
        curr_name = names[i]
        while curr_name == names[i]:
            batch.append((probs[i], seq_type[i]))
            i += 1
            if i == len(probs):
                break
        assert batch[0][1] == 'orig'
        orig = batch[0][0]
        tmp = {'name': curr_name, 'utr5': None, 'cds': None, 'utr3': None}
        for j in xrange(3):
            try:
                sub_batch = batch[1 + j * args.n:1 + (j + 1) * args.n]
                tmp[sub_batch[0][1]] = z_score(orig, zip(*sub_batch)[0])
            except IndexError:
                pass
        Zscores.append([tmp[key] for key in ['name', 'utr5', 'cds', 'utr3']])
    comments = '#fasta: {0}, weights: {1}, number of shuffles: {2}'.format(
        args.fasta, args.weights, args.n)
    lines = [
        comments,
        "transcript\t5' UTR Z-score\tCDS Z-score\t3' UTR Z-score\t5' UTR length\tCDS length\t3' UTR length"
    ]
    for z, lens in zip(Zscores, seq_lens):
        line = map(str, z + lens)
        lines.append('\t'.join(line))

    with open(args.output, 'w') as out:
        out.write('\n'.join(lines))

    if args.p:
        plot_zscore_scatter(Zscores, seq_lens, args.p)
        plot_zscore_histogram(Zscores, seq_lens, args.p)