else:
            single_norm = None

# Gather the three read-file variables from the normalization step and make
# sure every one that is set points at a real file before continuing.
norm_reads = (left_norm, right_norm, single_norm)
lib.log.debug(norm_reads)
for read in norm_reads:
    # Unset entries (e.g. None) are skipped.
    if read and not os.path.isfile(read):
        lib.log.error("Read normalization failed, %s does not exist." % read)
        sys.exit(1)

# Obtain the Trinity assembly at tmpdir/trinity.fasta. Order of preference:
# an existing result, a user-supplied file (args.trinity), then a fresh
# genome-guided Trinity run.
trinity_transcripts = os.path.join(tmpdir, 'trinity.fasta')
if lib.checkannotations(trinity_transcripts):
    lib.log.info(
        "Existing Trinity results found: {:}".format(trinity_transcripts))
elif args.trinity:
    shutil.copyfile(os.path.abspath(args.trinity), trinity_transcripts)
else:
    runTrinityGG(genome, norm_reads, trinity_transcripts)

# Hand the transcripts to SeqClean (per the original note: clips polyA tails
# and removes low quality sequences).
# NOTE(review): cleanTranscripts is presumably where SeqClean leaves its
# output -- confirm against runSeqClean.
cleanTranscripts = os.path.join(tmpdir, 'trinity.fasta.clean')
runSeqClean(trinity_transcripts, tmpdir)
# Example #2 (start of a separate excerpt)
def runExonerate(input):
    """Run exonerate (protein2genome) for one protein against a scaffold window.

    Relies on the module-level names ``tmpdir``, ``protein_dict``, ``args``
    and ``lib``.

    Args:
        input: ``:::``-delimited string whose first four fields are
            ``ProtID:::ScaffID:::ScaffStart:::ScaffEnd``; the two coordinates
            must parse as integers.

    Side effects:
        Writes the protein and the padded scaffold window as temporary FASTA
        files in ``tmpdir`` and exonerate's stdout to
        ``tmpdir/exonerate.<ProtID>.<ScaffID>__<start>__.out``. The temporary
        FASTA files are removed on success, or moved to ``tmpdir/failed`` when
        exonerate writes a WARNING to stderr. An output file smaller than
        500 bytes is deleted.

    Raises:
        IndexError: if ``input`` has fewer than four fields.
        ValueError: if a coordinate field is not an integer.
    """
    fields = input.split(':::')
    ProtID = fields[0]
    ScaffID = fields[1]
    ScaffStart = int(fields[2])
    ScaffEnd = int(fields[3])

    # Write the protein model. The process id is part of the file name,
    # presumably so parallel workers do not overwrite each other's query file.
    query = os.path.join(tmpdir, ProtID + '.' + str(os.getpid()) + '.fa')
    with open(query, 'w') as output:
        SeqIO.write(protein_dict[ProtID], output, 'fasta')

    # Extract the hit region with a 3 kb cushion on either side, clamped to
    # the scaffold ends. The window start does not depend on the record, so it
    # is computed up front; the remaining names get defaults so that a
    # scaffold file without any record cannot cause a NameError further down.
    start = max(ScaffStart - 3000, 1)
    end = ScaffEnd + 3000
    header = None
    Sequence = ''
    scaffold = os.path.join(
        tmpdir, ScaffID + '.' + ProtID + '.' + str(ScaffStart) + '-' +
        str(ScaffEnd) + '.fa')
    with open(scaffold, 'w') as output2:
        # Plain 'r' mode: the 'U' flag was removed in Python 3.11.
        with open(os.path.join(tmpdir, 'scaffolds', ScaffID + '.fa'),
                  'r') as fullscaff:
            for header, Sequence in SimpleFastaParser(fullscaff):
                end = min(ScaffEnd + 3000, len(Sequence))
                # NOTE(review): `start` is clamped to 1 but used as a 0-based
                # slice index; kept as-is because `start` is embedded in the
                # output file name and downstream code may depend on it.
                output2.write('>%s\n%s\n' % (header, Sequence[start:end]))

    exoname = ProtID + '.' + ScaffID + '__' + str(start) + '__'
    exonerate_out = os.path.join(tmpdir, 'exonerate.' + exoname + '.out')
    ryo = "AveragePercentIdentity: %pi\n"
    cmd = [
        'exonerate', '--model', 'p2g', '--showvulgar', 'no', '--showalignment',
        'no', '--showquerygff', 'no', '--showtargetgff', 'yes', '--maxintron',
        str(args.maxintron), '--percent', '80', '--ryo', ryo, query, scaffold
    ]

    # Only run when both input files were created and pass validation.
    if lib.checkannotations(query) and lib.checkannotations(scaffold):
        with open(exonerate_out, 'w') as output3:
            proc = subprocess.Popen(cmd,
                                    stdout=output3,
                                    stderr=subprocess.PIPE)
            _, errors = proc.communicate()
        # stderr arrives as bytes on Python 3 (and as str, which equals bytes,
        # on Python 2), so compare against a bytes literal.
        if b'WARNING' in errors:
            lib.log.debug('Error in input:{:}'.format(input))
            lib.log.debug(
                '%s, Len=%i, %i-%i; %i-%i' %
                (header, len(Sequence), ScaffStart, ScaffEnd, start, end))
            # Keep the offending inputs around for inspection.
            os.rename(query,
                      os.path.join(tmpdir, 'failed', os.path.basename(query)))
            os.rename(
                scaffold,
                os.path.join(tmpdir, 'failed', os.path.basename(scaffold)))
        else:
            for y in [query, scaffold]:
                try:
                    lib.SafeRemove(y)
                except OSError:
                    lib.log.debug("Error removing %s" % (y))
        # Per the original note: runs without hits still produce some output,
        # so anything smaller than 500 bytes is dropped.
        if lib.getSize(exonerate_out) < 500:
            os.remove(exonerate_out)
    else:
        lib.log.debug('Error in query or scaffold:{:}'.format(input))
        lib.SafeRemove(query)
        lib.SafeRemove(scaffold)
# Example #3 (start of a separate excerpt)
# The logfiles directory is needed by later steps; create it when missing.
logfiles_dir = os.path.join(outputdir, 'logfiles')
if not os.path.isdir(logfiles_dir):
    os.makedirs(logfiles_dir)

# Work with absolute input paths from here on.
Proteins = os.path.abspath(Proteins)
genbank = os.path.abspath(genbank)

if 'phobius' in args.methods or 'all' in args.methods:
    # Phobius prediction of secreted and membrane proteins (per the original
    # note: runs locally if installed, otherwise remotely).
    phobius_out = os.path.join(outputdir, 'annotate_misc',
                               'phobius.results.txt')
    phobiusLog = os.path.join(logfiles_dir, 'phobius.log')
    lib.log.info(
        "Predicting secreted and transmembrane proteins using Phobius")
    if not lib.checkannotations(phobius_out):
        # Build the argument list once; '-e <email>' is inserted ahead of the
        # log option only when an email address was supplied.
        phobius_cmd = [
            os.path.join(parentdir, 'util', 'phobius-multiproc.py'), '-i',
            Proteins, '-o', phobius_out
        ]
        if args.email:
            phobius_cmd += ['-e', str(args.email)]
        phobius_cmd += ['-l', phobiusLog]
        subprocess.call(phobius_cmd)

# Set the InterProScan result path when 'interproscan' or 'all' is requested.
# NOTE(review): the else branch sets organism/isolate, which is unrelated to
# InterProScan -- it looks spliced in from another excerpt; confirm before
# relying on this if/else pairing.
if 'interproscan' in args.methods or 'all' in args.methods:
    IPRCombined = os.path.join(outputdir, 'annotate_misc', 'iprscan.xml')
    #run interpro scan
else:
    organism = args.species
    # fall back to the '???' placeholder when no isolate was given
    if not args.isolate:
        isolate = '???'
    else:
        isolate = args.isolate

############################################################################
# Start of the annotation workflow.
misc_dir = os.path.join(outputdir, 'annotate_misc')

ProtCount = lib.countfasta(Proteins)
lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')

# PFAM-A search; skipped when the results file already passes
# lib.checkannotations. 1e-50 is presumably the e-value cutoff -- confirm
# against lib.PFAMsearch.
lib.log.info("Running HMMer search of PFAM domains")
pfam_results = os.path.join(misc_dir, 'annotations.pfam.txt')
if not lib.checkannotations(pfam_results):
    lib.PFAMsearch(Proteins, args.cpus, 1e-50, misc_dir, pfam_results)
num_annotations = lib.line_count(pfam_results)
lib.log.info('{0:,}'.format(num_annotations) + ' annotations added')

# SwissProt Blast search, same skip-if-present pattern.
lib.log.info("Running Blastp search of UniProt DB")
blast_out = os.path.join(misc_dir, 'annotations.swissprot.txt')
if not lib.checkannotations(blast_out):
    lib.SwissProtBlast(Proteins, args.cpus, 1e-5, misc_dir, blast_out)
num_annotations = lib.line_count(blast_out)
lib.log.info('{0:,}'.format(num_annotations) + ' annotations added')

# The MEROPS search is announced here; the call itself lies beyond this
# excerpt.
lib.log.info("Running Blastp search of MEROPS protease DB")
# Example #5 (start of a separate excerpt)
else:
    organism_name = organism
# Spaces in the organism name become underscores; an explicit args.output
# takes precedence over the organism name as the output base name.
organism_name = organism_name.replace(' ', '_')
outputname = args.output if args.output else organism_name

# Working folder that tbl2asn is run from.
tmp = outputname + '_tmp'
if not os.path.exists(tmp):
    os.makedirs(tmp)

# Both inputs must pass lib.checkannotations before anything is staged.
if not lib.checkannotations(args.fasta):
    print('FASTA genome file not found: {:}'.format(args.fasta))
    sys.exit(1)
if not lib.checkannotations(args.tbl):
    print('TBL annotations file not found: {:}'.format(args.tbl))
    sys.exit(1)

# Stage the inputs under fixed names inside the working folder.
staged_fasta = os.path.join(tmp, 'genome.fsa')
staged_tbl = os.path.join(tmp, 'genome.tbl')
shutil.copyfile(args.fasta, staged_fasta)
shutil.copyfile(args.tbl, staged_tbl)

# Settings for the tbl2asn run: a user template if given, otherwise the
# bundled lib/test.sbt.
SBT = args.sbt if args.sbt else os.path.join(parentdir, 'lib', 'test.sbt')
discrep = outputname + '.discrepency.txt'
version = 1
# Never use more CPUs than there are scaffold chunks to process.
num = min(args.cpus, len(scaffolds))
lib.log.debug("Running Augustus on %i chunks, using %i CPUs" %
              (len(scaffolds), num))
lib.runMultiProgress(runAugustus, scaffolds, num)

# Concatenate the per-chunk outputs (tmpdir/<chunk>.augustus.gff3) into a
# single file, in the order the chunks are listed in `scaffolds`.
augustus_all = os.path.join(tmpdir, 'augustus_all.gff3')
lib.log.debug("Augustus prediction is finished, now concatenating results")
with open(augustus_all, 'w') as output:
    for file in scaffolds:
        file = os.path.join(tmpdir, file + '.augustus.gff3')
        with open(file) as input:
            output.write(input.read())

# The check only controls the debug message; the join below runs either way.
if lib.checkannotations(augustus_all):
    lib.log.debug('Augustus finished, now joining results')

# Prefer join_aug_pred.pl from PATH, else fall back to the copy shipped under
# AUGUSTUS_BASE/scripts.
if lib.which_path('join_aug_pred.pl'):
    join_script = 'join_aug_pred.pl'
else:
    join_script = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_aug_pred.pl')

# This shell-style string is for the debug log only; the actual call below
# wires stdin/stdout through file handles without a shell.
cmd = '{:} < {:} > {:}'.format(join_script, augustus_all, args.out)
lib.log.debug(cmd)

with open(args.out, 'w') as finalout:
    # Plain 'r' mode: the 'U' flag was removed in Python 3.11, and newline
    # translation is irrelevant here because the handle is only passed to the
    # child process as its stdin.
    with open(augustus_all, 'r') as infile:
        subprocess.call([join_script], stdin=infile, stdout=finalout)
# Example #7 (start of a separate excerpt)
        # Append the contents of every file in go_folder whose name does not
        # start with 'associations' to `pop` (a handle opened above this
        # excerpt).
        # NOTE(review): if `pop` is itself a file inside go_folder (the code
        # further down reads go_folder/population.txt), this loop would also
        # read that file while it is being written -- confirm, and skip it if
        # so.
        for file in os.listdir(go_folder):
            if not file.startswith('associations'):
                file = os.path.join(go_folder, file)
                with open(file) as input:
                    pop.write(input.read())

    # Run find_enrichment.py for every file in go_folder except the
    # 'associations*' and 'population*' files. The result goes to
    # <args.out>/go_enrichment/<base>.go.enrichment.txt, and the run is
    # skipped when that file already passes lib.checkannotations.
    for f in os.listdir(go_folder):
        if f.startswith('associations'):
            continue
        if f.startswith('population'):
            continue
        file = os.path.join(go_folder, f)
        base = f.replace('.txt', '')
        goa_out = os.path.join(args.out, 'go_enrichment', base+'.go.enrichment.txt')
        if not lib.checkannotations(goa_out):
            # study file, population file and associations file are passed as
            # the three positional arguments
            cmd = ['find_enrichment.py', '--obo', os.path.join(parentdir, 'DB', 'go.obo'), '--pval', '0.001', '--alpha', '0.001', '--method', 'fdr', file, os.path.join(go_folder, 'population.txt'), os.path.join(go_folder, 'associations.txt')]
            lib.runSubprocess2(cmd, '.', lib.log, goa_out)

    # Write go.html: page header first, then one section per enrichment
    # result file (the per-file handling continues beyond this excerpt).
    with open(os.path.join(args.out, 'go.html'), 'w') as output:
        # NOTE(review): -1 for display.max_colwidth is deprecated in newer
        # pandas releases (None is the replacement) -- confirm against the
        # pandas version in use.
        pd.set_option('display.max_colwidth', -1)
        pd.options.mode.chained_assignment = None #turn off warning
        output.write(lib.HEADER)
        output.write(lib.GO)
        for f in os.listdir(os.path.join(args.out, 'go_enrichment')):
            if f.endswith('go.enrichment.txt'):
                file = os.path.join(args.out, 'go_enrichment', f)
                base = os.path.basename(file)
                # NOTE(review): the files are written with the suffix
                # '.go.enrichment.txt' (dot), but this splits on
                # '.go_enrichment.txt' (underscore), so the suffix is never
                # stripped and `name` ends up equal to `base` -- likely a typo.
                name = base.split('.go_enrichment.txt')[0]
                #check goatools output, return is a tuple with True/False and header line #
# Example #8 (start of a separate excerpt)
# Download the requested BUSCO lineage database when it is not yet present
# under <parentdir>/DB.
if not os.path.isdir(os.path.join(parentdir, 'DB', args.busco_db)):
    lib.download_buscos(args.busco_db)

ProtCount = lib.countfasta(args.input)
lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')

# Screen the proteins with the bundled BUSCO script in 'proteins' mode.
lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db)
BUSCODB = os.path.join(parentdir, 'DB', args.busco_db)
BUSCO = os.path.join(parentdir, 'util', 'funannotate-BUSCO2.py')
cmd = [sys.executable, BUSCO, '-i', os.path.abspath(args.input), '-m', 'proteins', '--lineage', BUSCODB, '-o', species, '--cpu', str(args.cpus), '-f']
lib.runSubprocess(cmd, '.', lib.log)

# BUSCO writes its table to run_<species>/full_table_<species>.tsv; treat a
# missing or invalid table as a failed run.
busco_results = os.path.join('run_'+species, 'full_table_'+species+'.tsv')
if not lib.checkannotations(busco_results):
    lib.log.error("BUSCO failed, check logfile")
    sys.exit(1)

# Map column 3 (cols[2]) -> column 1 (cols[0]) for rows whose status column
# is 'Complete'. An ID seen more than once is dropped entirely; remembering
# dropped IDs in duplicate_ids keeps a third occurrence from silently
# re-adding an ID that was already removed as a duplicate.
nameChange = {}
duplicate_ids = set()
# Plain 'r' mode: the 'U' flag was removed in Python 3.11.
with open(busco_results, 'r') as input:
    for line in input:
        if line.startswith('#'):
            continue
        cols = line.split('\t')
        # Rows with fewer than three columns (e.g. blank lines) carry no ID
        # and are ignored rather than raising IndexError.
        if len(cols) > 2 and cols[1] == 'Complete':
            if cols[2] in nameChange or cols[2] in duplicate_ids:
                lib.log.error("Duplicate ID found: %s %s. Removing from results" % (cols[2], cols[0]))
                duplicate_ids.add(cols[2])
                nameChange.pop(cols[2], None)
            else:
                nameChange[cols[2]] = cols[0]
        strain = '???'
    else:
        strain = args.strain

############################################################################
# Start of the annotation workflow.
misc_dir = os.path.join(outputdir, 'annotate_misc')

# Nothing can be annotated without at least one protein record.
ProtCount = lib.countfasta(Proteins)
lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')
if ProtCount < 1:
    lib.log.error("There are no gene models in this genbank file")
    sys.exit(1)

# PFAM-A search; skipped when the results file already passes
# lib.checkannotations.
lib.log.info("Running HMMer search of PFAM domains")
pfam_results = os.path.join(misc_dir, 'annotations.pfam.txt')
if not lib.checkannotations(pfam_results):
    lib.PFAMsearch(Proteins, args.cpus, 1e-50, misc_dir, pfam_results)
num_annotations = lib.line_count(pfam_results)
lib.log.info('{0:,}'.format(num_annotations) + ' annotations added')

# SwissProt Blast search, same skip-if-present pattern.
lib.log.info("Running Blastp search of UniProt DB")
blast_out = os.path.join(misc_dir, 'annotations.swissprot.txt')
if not lib.checkannotations(blast_out):
    lib.SwissProtBlast(Proteins, args.cpus, 1e-5, misc_dir, blast_out)
num_annotations = lib.line_count(blast_out)
lib.log.info('{0:,}'.format(num_annotations) + ' annotations added')

# MEROPS Blast search: blast_out is re-pointed at the MEROPS results file
# (the search call itself lies beyond this excerpt).
lib.log.info("Running Blastp search of MEROPS protease DB")
blast_out = os.path.join(misc_dir, 'annotations.merops.txt')
# Example #10 (start of a separate excerpt)
# Per-run working directory, named after the current process id.
pid = os.getpid()
tmpdir = 'mask_' + str(pid)
os.makedirs(tmpdir)
repeats = None

# Pick the masking strategy from the options given:
#   1. a repeat library file  -> mask with that library (it must be valid)
#   2. a RepeatMasker species -> mask with that species
#   3. neither                -> run RepeatModeler, then mask with its library
if args.repeatmodeler_lib:
    if not lib.checkannotations(args.repeatmodeler_lib):
        lib.log.error('ERROR: repeat library is not a valid file: {:}'.format(
            args.repeatmodeler_lib))
        sys.exit(1)
    lib.RepeatMask(args.input, args.repeatmodeler_lib, args.cpus, tmpdir,
                   args.out, log_name)
elif args.repeatmasker_species:
    lib.RepeatMaskSpecies(args.input, args.repeatmasker_species, args.cpus,
                          tmpdir, args.out, log_name)
else:
    repeats = 'repeatmodeler-library.' + str(pid) + '.fasta'
    lib.RepeatModelMask(args.input, args.cpus, tmpdir, args.out, repeats,
                        log_name)

# Collect statistics from the masked output: number of records and total
# sequence length. maskedSize is initialised here but not updated in the
# visible lines (the loop may continue beyond this excerpt).
scaffolds = 0
maskedSize = 0
GenomeLength = 0
# Plain 'r' mode: the 'U' flag was removed in Python 3.11.
with open(args.out, 'r') as input:
    for rec, Seq in SimpleFastaParser(input):
        scaffolds += 1
        GenomeLength += len(Seq)
else:
    organism = args.species
    if not args.isolate:
        isolate = '???'
    else:
        isolate = args.isolate

############################################################################
# Start of the annotation workflow.
misc_dir = os.path.join(outputdir, 'annotate_misc')

ProtCount = lib.countfasta(Proteins)
lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')

# Each search below is skipped when its results file already passes
# lib.checkannotations.

# PFAM-A search
lib.log.info("Running HMMer search of PFAM domains")
pfam_results = os.path.join(misc_dir, 'annotations.pfam.txt')
if not lib.checkannotations(pfam_results):
    lib.PFAMsearch(Proteins, args.cpus, 1e-50, misc_dir, pfam_results)
num_annotations = lib.line_count(pfam_results)
lib.log.info('{0:,}'.format(num_annotations) + ' annotations added')

# SwissProt Blast search
lib.log.info("Running Blastp search of UniProt DB")
blast_out = os.path.join(misc_dir, 'annotations.swissprot.txt')
if not lib.checkannotations(blast_out):
    lib.SwissProtBlast(Proteins, args.cpus, 1e-5, misc_dir, blast_out)
num_annotations = lib.line_count(blast_out)
lib.log.info('{0:,}'.format(num_annotations) + ' annotations added')

# MEROPS Blast search (blast_out is re-pointed at the MEROPS results file)
lib.log.info("Running Blastp search of MEROPS protease DB")
blast_out = os.path.join(misc_dir, 'annotations.merops.txt')
if not lib.checkannotations(blast_out):
    lib.MEROPSBlast(Proteins, args.cpus, 1e-5, misc_dir, blast_out)
            single_norm = None

# Gather the three read-file variables from the normalization step and make
# sure every one that is set points at a real file before continuing.
norm_reads = (left_norm, right_norm, single_norm)
lib.log.debug(norm_reads)
for read in norm_reads:
    if read and not os.path.isfile(read):
        lib.log.error("Read normalization failed, %s does not exist." % read)
        sys.exit(1)

# Obtain the raw Trinity assembly at tmpdir/trinity.tmp: reuse an existing
# result, else copy a user-supplied file (args.trinity), else run
# genome-guided Trinity.
trinity_transcripts = os.path.join(tmpdir, 'trinity.fasta')
trinity_tmp = os.path.join(tmpdir, 'trinity.tmp')
if lib.checkannotations(trinity_tmp):
    lib.log.info("Existing Trinity results found {:}".format(trinity_tmp))
elif args.trinity:
    shutil.copyfile(os.path.abspath(args.trinity), trinity_tmp)
else:
    runTrinityGG(genome, norm_reads, trinity_tmp)

# Verify the assembly BEFORE it is consumed: previously this check ran only
# after polyAclip had already been handed trinity_tmp, so a failed Trinity
# step could surface as an error inside polyAclip instead of this message.
if not lib.checkannotations(trinity_tmp):
    lib.log.error("TRINITY step failed, check logfile, exiting")
    sys.exit(1)

# Clip polyA tails: reads trinity_tmp, and is expected to write
# trinity_transcripts (confirm against polyAclip).
polyAclip(trinity_tmp, trinity_transcripts)