Exemple #1
0
def runExonerate(input):
    '''
    Align one protein against the genome region of its hit using exonerate.

    ``input`` is a single ':::'-delimited string of the form
    ``ProtID:::ScaffID:::ScaffStart:::ScaffEnd`` (start/end are parsed as
    integers). The function reads the module-level ``tmpdir``,
    ``protein_dict`` and ``args`` (for ``args.maxintron``).

    Side effects, all inside ``tmpdir``:
      * writes the protein query and the padded scaffold slice as FASTA files
      * writes ``exonerate.<ProtID>.<ScaffID>__<start>__.out``; the file is
        deleted again when it is smaller than 500 bytes
      * when exonerate prints a WARNING on stderr, both FASTA inputs are moved
        into ``failed/``; otherwise they are removed

    Returns None.
    '''
    s = input.split(':::')
    ProtID = s[0]
    ScaffID = s[1]
    ScaffStart = int(s[2])
    ScaffEnd = int(s[3])
    # get the protein model
    query = os.path.join(tmpdir, ProtID + '.' + str(os.getpid()) + '.fa')
    with open(query, 'w') as output:
        SeqIO.write(protein_dict[ProtID], output, 'fasta')
    # grab a 3 kb cushion on either side of hit region, careful of scaffold ends.
    # These are bound up front so they exist even if the scaffold file turns
    # out to contain no records.
    # NOTE(review): start is clamped to 1 but then used as a 0-based slice
    # index, so the first base of a scaffold is never included; the value is
    # also embedded in the output file name, so it is left unchanged here.
    start = max(1, ScaffStart - 3000)
    end = ScaffEnd + 3000
    header = None
    seq_len = 0
    # now get the genome region, use different variable names for SeqRecords to avoid collision
    scaffold = os.path.join(
        tmpdir, ScaffID + '.' + ProtID + '.' + str(ScaffStart) + '-' +
        str(ScaffEnd) + '.fa')
    with open(scaffold, 'w') as output2:
        # plain 'r': the legacy 'U' flag is rejected by Python 3.11+
        with open(os.path.join(tmpdir, 'scaffolds', ScaffID + '.fa'),
                  'r') as fullscaff:
            for header, Sequence in SimpleFastaParser(fullscaff):
                seq_len = len(Sequence)
                end = min(ScaffEnd + 3000, seq_len)
                output2.write('>%s\n%s\n' % (header, Sequence[start:end]))
    exoname = ProtID + '.' + ScaffID + '__' + str(start) + '__'
    # check that input files are created and valid
    exonerate_out = os.path.join(tmpdir, 'exonerate.' + exoname + '.out')
    ryo = "AveragePercentIdentity: %pi\n"
    cmd = [
        'exonerate', '--model', 'p2g', '--showvulgar', 'no', '--showalignment',
        'no', '--showquerygff', 'no', '--showtargetgff', 'yes', '--maxintron',
        str(args.maxintron), '--percent', '80', '--ryo', ryo, query, scaffold
    ]
    if lib.checkannotations(query) and lib.checkannotations(scaffold):
        # run exonerate, capture errors; keep the output handle open until
        # the child process has finished writing to it
        with open(exonerate_out, 'w') as output3:
            proc = subprocess.Popen(cmd,
                                    stdout=output3,
                                    stderr=subprocess.PIPE)
            errors = proc.communicate()[1]
        # communicate() hands back bytes on Python 3; decode so the substring
        # test below works on both Python 2 and 3
        if isinstance(errors, bytes):
            errors = errors.decode('utf-8', 'replace')
        if 'WARNING' in errors:
            lib.log.debug('Error in input:{:}'.format(input))
            lib.log.debug(
                '%s, Len=%i, %i-%i; %i-%i' %
                (header, seq_len, ScaffStart, ScaffEnd, start, end))
            # keep the offending inputs around for inspection
            os.rename(query,
                      os.path.join(tmpdir, 'failed', os.path.basename(query)))
            os.rename(
                scaffold,
                os.path.join(tmpdir, 'failed', os.path.basename(scaffold)))
        else:
            for y in [query, scaffold]:
                try:
                    lib.SafeRemove(y)
                except OSError:
                    lib.log.debug("Error removing %s" % (y))
        # check filesize of exonerate output, no hits still have some output data in them, should be safe dropping anything smaller than 500 bytes
        if lib.getSize(exonerate_out) < 500:
            os.remove(exonerate_out)
    else:
        lib.log.debug('Error in query or scaffold:{:}'.format(input))
        lib.SafeRemove(query)
        lib.SafeRemove(scaffold)
Exemple #2
0
                        cols[4] = str(int(cols[4]) + offset)
                        output.write('\t'.join(cols))

# translate the raw exonerate output into GFF3 with EVM's ExoConverter
with open(args.out, 'w') as output:
    subprocess.call([ExoConverter, exonerate_raw],
                    stdout=output,
                    stderr=FNULL)

# report how many alignments exonerate produced
Found = lib.countGFFgenes(exonerate_raw)
lib.log.info('Exonerate finished: found {:,} alignments'.format(Found))

# optionally keep a copy of the tblastn results
if args.tblastn_out:
    shutil.copyfile(BlastResult, args.tblastn_out)

# final clean-up of the working directory
if not args.debug:
    if os.path.isdir(tmpdir):
        lib.SafeRemove(tmpdir)
else:
    # os.rmdir only succeeds on an empty directory, so it doubles as the
    # "were there any failed alignments?" test
    try:
        os.rmdir(os.path.join(tmpdir, 'failed'))
    except OSError:
        empty = False
    else:
        empty = True
    if not empty:
        lib.log.error("Failed exonerate alignments found, see files in %s" %
                      os.path.join(tmpdir, 'failed'))
    else:
        lib.SafeRemove(tmpdir)
Exemple #3
0
def main(args):
    '''
    Command-line entry point: soft-mask repeats in a genome assembly.

    ``args`` is the list of command-line tokens handed to argparse. Depending
    on ``--method`` the genome is masked with tantan, or with
    RepeatMasker (optionally preceded by RepeatModeler, or driven by a
    user-supplied repeat library / RepeatMasker species). Afterwards the
    masked FASTA in ``--out`` is scanned and summary statistics are logged.

    Side effects: recreates ``funannotate-mask.log`` in the current directory,
    creates a ``mask_<uuid>`` working directory for the RepeatMasker routes
    (removed again unless ``--debug`` is given), and exits with status 1 when
    the supplied repeat library fails ``lib.checkannotations``.
    '''
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='funannotate-mask.py',
        description='''Wrapper for RepeatModeler/RepeatMasker''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='genome assembly FASTA format')
    parser.add_argument('-o',
                        '--out',
                        required=True,
                        help='Output softmasked FASTA file')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('-m',
                        '--method',
                        default='tantan',
                        choices=['repeatmodeler', 'repeatmasker', 'tantan'],
                        help='Method to mask repeats with')
    parser.add_argument('-s',
                        '--repeatmasker_species',
                        help='RepeatMasker species, will skip repeatmodeler')
    parser.add_argument(
        '-l',
        '--repeatmodeler_lib',
        help='Pre-computed RepeatModeler (or other) repetitive elements')
    parser.add_argument('--cpus',
                        default=2,
                        type=int,
                        help='Number of CPUs to use')
    args = parser.parse_args(args)

    # create log file for Repeats(capture stderr)
    log_name = 'funannotate-mask.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running funanotate v{:}".format(version))

    # both stay None on the tantan route; they are only set when a
    # RepeatModeler/RepeatMasker run needs a working directory or library
    repeats = None
    tmpdir = None
    if args.method == 'tantan':
        programs = ['tantan']
        lib.CheckDependencies(programs)
        lib.log.info('Soft-masking simple repeats with tantan')
        runTanTan(args.input, args.out)
    else:
        programs = ['RepeatMasker']
        if args.method == 'repeatmodeler':
            programs += ['BuildDatabase', 'RepeatModeler']
        lib.CheckDependencies(programs)

        # create tmpdir
        pid = uuid.uuid4()
        tmpdir = 'mask_' + str(pid)
        os.makedirs(tmpdir)

        # parse options which dictates how repeatmodeler/masker are run
        if not args.repeatmodeler_lib:  # no fasta file given, so
            if not args.repeatmasker_species:  # no species given, so run entire repeatmodler + repeat masker
                repeats = 'repeatmodeler-library.' + str(pid) + '.fasta'
                RepeatModelMask(args.input, args.cpus, tmpdir, args.out,
                                repeats, args.repeatmasker_species, log_name)
            else:
                RepeatMaskSpecies(args.input, args.repeatmasker_species,
                                  args.cpus, tmpdir, args.out, log_name)
        else:
            if lib.checkannotations(args.repeatmodeler_lib):
                RepeatMask(args.input, args.repeatmodeler_lib, args.cpus,
                           tmpdir, args.out, log_name)
            else:
                lib.log.error(
                    'ERROR: repeat library is not a valid file: {:}'.format(
                        args.repeatmodeler_lib))
                sys.exit(1)

    # output some stats on %reads masked.
    scaffolds = 0
    maskedSize = 0
    GenomeLength = 0
    with open(args.out, 'r') as fasta_in:
        for _, Seq in SimpleFastaParser(fasta_in):
            scaffolds += 1
            GenomeLength += len(Seq)
            maskedSize += lib.n_lower_chars(Seq)

    # an output without any sequence would otherwise divide by zero
    if GenomeLength:
        percentMask = maskedSize / float(GenomeLength)
    else:
        percentMask = 0.0
    lib.log.info(
        'Repeat soft-masking finished: \nMasked genome: {:}\nnum scaffolds: {:,}\nassembly size: {:,} bp\nmasked repeats: {:,} bp ({:.2f}%)'
        .format(os.path.abspath(args.out), scaffolds, GenomeLength, maskedSize,
                percentMask * 100))
    if repeats:
        lib.log.info('RepeatModeler library: {:}'.format(repeats))
    # clean up
    if tmpdir and not args.debug:
        lib.SafeRemove(tmpdir)
    print("-------------------------------------------------------")
Exemple #4
0
def main(args):
    '''
    Command-line entry point: build a GenBank file from a tbl + FASTA pair.

    ``args`` is the list of command-line tokens handed to argparse. The
    annotation table and genome are copied into ``<output>_tmp`` as
    ``genome.tbl`` / ``genome.fsa``, tbl2asn is run there via ``runtbl2asn``,
    the validation output is checked with ``ncbiCheckErrors`` and the
    resulting ``genome.gbf`` is copied to ``<output>.gbk``.

    The temporary folder is removed only when ``ncbiCheckErrors`` reports
    fewer than one error. Exits with status 1 when either input file fails
    ``lib.checkannotations``.
    '''
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='gbk2parts.py',
        description='''Script to convert GBK file to its components.''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--tbl',
                        required=True,
                        help='Genome annotation in tbl format')
    parser.add_argument('-f',
                        '--fasta',
                        required=True,
                        help='Genome in FASTA format')
    parser.add_argument(
        '-s',
        '--species',
        required=True,
        help=
        'Species name (e.g. "Aspergillus fumigatus") use quotes if there is a space'
    )
    parser.add_argument('--isolate', help='Isolate name (e.g. Af293)')
    parser.add_argument('--strain', help='Strain name (e.g. CEA10)')
    parser.add_argument(
        '-t',
        '--tbl2asn',
        help='Custom parameters for tbl2asn, example: linkage and gap info')
    parser.add_argument('--sbt', help='tbl2asn template file')
    parser.add_argument('-o', '--output', help='Output basename')
    args = parser.parse_args(args)

    parentdir = os.path.dirname(lib.__file__)

    # validate the inputs before touching the filesystem so that a bad path
    # does not leave an empty tmp folder behind
    if not lib.checkannotations(args.fasta):
        print('FASTA genome file not found: {:}'.format(args.fasta))
        sys.exit(1)
    if not lib.checkannotations(args.tbl):
        print('TBL annotations file not found: {:}'.format(args.tbl))
        sys.exit(1)

    # see if organism/species/isolate was passed at command line
    if args.species:
        organism = args.species
    else:
        # fall back to the tbl file name (only reachable with an empty -s)
        organism = os.path.basename(args.tbl).split('.t')[0]
    # strain takes precedence over isolate when both are given
    if args.strain:
        organism_name = organism + '_' + args.strain
    elif args.isolate:
        organism_name = organism + '_' + args.isolate
    else:
        organism_name = organism
    organism_name = organism_name.replace(' ', '_')
    if args.output:
        outputname = args.output
    else:
        outputname = organism_name

    # create tmp folder to run tbl2asn from
    tmp = outputname + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    # now move files into proper location
    shutil.copyfile(args.fasta, os.path.join(tmp, 'genome.fsa'))
    shutil.copyfile(args.tbl, os.path.join(tmp, 'genome.tbl'))

    # now we can run tbl2asn
    if args.sbt:
        SBT = args.sbt
    else:
        # default template shipped next to the library module
        SBT = os.path.join(parentdir, 'config', 'test.sbt')
    discrep = outputname + '.discrepency.txt'
    version = 1
    runtbl2asn(tmp, SBT, discrep, organism, args.isolate, args.strain,
               args.tbl2asn, version)

    # check the output for errors for NCBI
    final_fixes = os.path.join(tmp, 'models-need-fixing.txt')
    prefix = locustagGB(os.path.join(tmp, 'genome.gbf'))
    errors = ncbiCheckErrors(os.path.join(tmp, 'errorsummary.val'),
                             os.path.join(tmp, 'genome.val'), prefix,
                             final_fixes)

    # get output files
    gbkout = outputname + '.gbk'
    shutil.copyfile(os.path.join(tmp, 'genome.gbf'), gbkout)
    # keep the tmp folder around when there is something to fix
    if errors < 1:
        lib.SafeRemove(tmp)
def runtbl2asn_parallel(folder, template, discrepency, organism, isolate,
                        strain, parameters, version, cpus):
    '''
    Run NCBI tbl2asn in parallel and merge the per-folder results.

    tbl2asn is launched (through ``tbl2asn_safe_run``) once for every
    sub-folder of ``folder``, or once for ``folder`` itself when it has no
    sub-folders, using a pool of ``cpus`` worker processes. Afterwards the
    outputs found in the leaf folders are combined inside ``folder``:

      * ``errorsummary.val``        -> appended to ``folder/errorsummary.val``
      * other ``*.val``             -> appended to ``folder/genome.val``
      * ``*.gbf``                   -> appended to ``folder/genome.gbf``
      * ``discrepency.report.txt``  -> appended to ``discrepency``
      * ``*.tbl`` / ``*.sqn``       -> copied into ``folder``

    Parameters:
      folder      directory holding the tbl2asn inputs (or sub-folders of them)
      template    path passed to tbl2asn ``-t``
      discrepency path of the combined discrepancy report
      organism    organism name (required), used in the ``-j`` meta string
      isolate     optional isolate name for the ``-j`` meta string
      strain      optional strain name for the ``-j`` meta string
      parameters  optional whitespace-separated extra tbl2asn arguments
      version     value passed to tbl2asn ``-N``
      cpus        number of worker processes

    Exits with status 1 when ``folder`` is not a directory or ``organism``
    is empty. Exceptions raised inside a worker are logged, not re-raised.
    '''
    val_path = os.path.join(folder, 'genome.val')
    summary_path = os.path.join(folder, 'errorsummary.val')
    gbf_path = os.path.join(folder, 'genome.gbf')
    # make sure output that will be appended to is not there
    for stale in [val_path, summary_path, gbf_path, discrepency]:
        lib.SafeRemove(stale)
    # get funannotate version
    fun_version = lib.get_version()
    # input should be a folder
    if not os.path.isdir(folder):
        lib.log.error("tbl2asn error: %s is not a directory, exiting" % folder)
        sys.exit(1)
    # based on organism, isolate, strain, construct meta info for -j flag
    if not organism:
        lib.log.error("tbl2asn error: organism not specified")
        sys.exit(1)
    meta = "[organism=" + organism + "]"
    if isolate:
        meta = meta + " " + "[isolate=" + isolate + "]"
    if strain:
        meta = meta + " " + "[strain=" + strain + "]"
    cmd = [
        'tbl2asn', '-y', '"Annotated using ' + fun_version + '"', '-N',
        str(version), '-t', template, '-M', 'n', '-j', '"' + meta + '"', '-V',
        'b', '-c', 'f', '-T', '-a', 'r10u'
    ]
    # check for custom parameters; split() (no argument) ignores repeated
    # spaces instead of producing empty arguments
    if parameters:
        cmd = cmd + parameters.split()
    # check for folders in the input folder, if present, run tbl2asn on each folder and then combine
    multiple = [
        os.path.join(folder, entry) for entry in os.listdir(folder)
        if os.path.isdir(os.path.join(folder, entry))
    ]
    if not multiple:
        multiple.append(folder)
    p = multiprocessing.Pool(cpus)
    results = [p.apply_async(tbl2asn_safe_run, (cmd, i)) for i in multiple]
    p.close()
    p.join()
    # apply_async stores worker exceptions in the result object; surface them
    # in the log instead of silently dropping them
    for job in results:
        try:
            job.get()
        except Exception as err:
            lib.log.error('tbl2asn worker failed: {:}'.format(err))
    # now collect the results and combine them in the main folder (stale
    # copies of the combined files were already removed at the top)
    with open(val_path, 'a') as validation, \
            open(discrepency, 'a') as discrep, \
            open(summary_path, 'a') as summary, \
            open(gbf_path, 'a') as genbank:
        for dirName, subdirList, fileList in os.walk(folder, topdown=False):
            # only leaf folders hold per-run output
            if subdirList:
                continue
            for f in fileList:
                src = os.path.join(dirName, f)
                # dest_handle is None for files that are copied, not appended
                if f == 'errorsummary.val':
                    dest_path, dest_handle = summary_path, summary
                elif f.endswith('.val'):
                    dest_path, dest_handle = val_path, validation
                elif f.endswith('.gbf'):
                    dest_path, dest_handle = gbf_path, genbank
                elif f.endswith(('.tbl', '.sqn')):
                    dest_path, dest_handle = os.path.join(folder, f), None
                elif f == 'discrepency.report.txt':
                    dest_path, dest_handle = discrepency, discrep
                else:
                    continue
                # when tbl2asn ran directly in `folder` the outputs are
                # already in place; appending/copying a file onto itself
                # would duplicate its content or raise SameFileError
                if os.path.abspath(src) == os.path.abspath(dest_path):
                    continue
                if dest_handle is None:
                    shutil.copyfile(src, dest_path)
                else:
                    with open(src) as infile:
                        dest_handle.write(infile.read())
def _fresh_chunk_dir(path):
    '''Create ``path`` as an empty directory, discarding any previous copy.'''
    if os.path.isdir(path):
        lib.SafeRemove(path)
    os.makedirs(path)


def split_tbl2asn(folder):
    '''
    function to chunk the genome and annotation files into parts if > 10,000 contigs to
    conform to NCBI recommendations and avoid the 2GB threshold of sequin files

    Reads ``genome.fsa`` and ``genome.tbl`` from ``folder``.

    * Fewer than 10,000 contigs and less than 100 Mb: both files are copied
      unchanged into a fresh sub-folder ``1``.
    * Otherwise: the contigs are sorted by length, distributed over numbered
      sub-folders ``1..N`` with ``list_slice`` and written as
      ``genome<N>.fsa``; every ``>Feature`` block of the tbl file is appended
      to the ``genome<N>.tbl`` of the chunk that holds its contig. tbl blocks
      whose contig ID is not a FASTA header are skipped.

    Existing numbered sub-folders are removed and recreated. Returns None.
    '''
    max_bp = int(100e6)
    max_seqs = 10000
    numSeqs = 0
    genomeSize = 0
    # first pass only counts, so small genomes are never loaded into memory
    with open(os.path.join(folder, 'genome.fsa'), 'r') as fastain:
        for _, Seq in SimpleFastaParser(fastain):
            numSeqs += 1
            genomeSize += len(Seq)
    # if less than 10,000 contigs and less than 100 MB, then don't split and just run it
    if numSeqs < max_seqs and genomeSize < max_bp:
        # move to subfolder for multiprocessing to work correctly
        single = os.path.join(folder, '1')
        _fresh_chunk_dir(single)
        shutil.copyfile(os.path.join(folder, 'genome.fsa'),
                        os.path.join(single, 'genome.fsa'))
        shutil.copyfile(os.path.join(folder, 'genome.tbl'),
                        os.path.join(single, 'genome.tbl'))
    else:
        # rounded_up = -(-numerator // denominator) #nice trick to round up
        # honour BOTH limits: a large genome made of very many contigs needs
        # enough chunks to satisfy the contig-count limit as well as the
        # 100 MB target
        chunks = max(-(-genomeSize // max_bp), -(-numSeqs // max_seqs))
        with open(os.path.join(folder, 'genome.fsa'), 'r') as fastain:
            Records = list(SimpleFastaParser(fastain))
        # sort the fasta tuples by size
        Records.sort(key=lambda x: len(x[1]), reverse=True)
        # shuffle them into lists like dealing playing cards then all chunks have similar sizes
        sliced_records = list_slice(Records, chunks)
        # loop through and add headers to dictionary for tbl splitting lookup
        headers = {}
        for filenum, chunk in enumerate(sliced_records, 1):
            chunk_dir = os.path.join(folder, str(filenum))
            _fresh_chunk_dir(chunk_dir)
            with open(
                    os.path.join(chunk_dir, 'genome' + str(filenum) + '.fsa'),
                    'w') as outfile:
                for title, seq in chunk:
                    outfile.write('>{:}\n{:}\n'.format(title, seq))
                    headers[title] = filenum
        # now parse tbl file and split in same way as fasta files
        with open(os.path.join(folder, 'genome.tbl'), 'r') as tblin:
            for contig in lib.readBlocks(tblin, '>Feature'):
                # the contig ID is the last space-separated token of the
                # block's first line
                ID = contig[0].split(' ')[-1].rstrip()
                filenum = headers.get(ID)
                if filenum:
                    # append mode: the chunk folder was recreated above, so
                    # the file starts out empty for this run
                    with open(
                            os.path.join(folder, str(filenum),
                                         'genome' + str(filenum) + '.tbl'),
                            'a') as tblout:
                        tblout.write(''.join(contig))