Example #1
0
def splitfastas(infile, fastadir, filepathinfo, seqlengthfile):
    """Split a multi-sample fasta into one fasta file per sample.

    Fasta records are read from infile and grouped by sample name (the part
    of the fasta id before the first '|'; the header format should be sample
    or sample|contig). Each group is written to <fastadir>/<sample>.fasta,
    with any description after the id dropped from the header.

    Args:
        infile: open handle to read fasta text from (e.g. a file or
            sys.stdin); it is closed once parsing has finished or failed.
        fastadir: directory for the per-sample fasta files; created with
            mkdir -p.
        filepathinfo: path of the tsv to write, one line per sample:
            sample, fasta filepath, blast database filepath.
        seqlengthfile: path of the tsv to write, one line per record:
            record id, sequence length.

    Raises:
        AssertionError: if no records were read from infile.
    """
    from Bio import SeqIO
    import re, os
    from pythonmods import runsubprocess
    #first create splitfasta output directory
    #NOTE(review): fastadir is interpolated unquoted into a shell command; paths containing spaces would presumably break - confirm against callers
    runsubprocess(['mkdir -p %s' % fastadir], shell=True)
    #parse fasta and store in recorddict (sample -> list of (id, sequence))
    recorddict = {}
    try:
        for recordid, recordseq in SeqIO.FastaIO.SimpleFastaParser(infile):
            #remove description from fasta id if present
            recordid = re.sub(r'(\S+)(?: .*)?', r'\1', recordid).strip()
            #get sample name: everything before the first '|'
            sample = recordid.partition('|')[0]
            recorddict.setdefault(sample, []).append((recordid, recordseq))
    finally:
        #release the input handle even if parsing raised
        infile.close()
    #write records to splitfastas directory, split by sample; write seqlengths to seqlengths.tsv
    #context managers make sure both tsv handles are closed if a write fails
    with open(filepathinfo, 'w') as f2, open(seqlengthfile, 'w') as f3:
        for sample, records in recorddict.items():
            fastafilepath = '%s/%s.fasta' % (fastadir, sample)
            blastdbfilepath = '%s_db' % os.path.splitext(fastafilepath)[0]
            f2.write('%s\t%s\t%s\n' % (sample, fastafilepath, blastdbfilepath))
            with open(fastafilepath, 'w') as output_handle:
                for recordid, recordseq in records:
                    f3.write('%s\t%s\n' % (recordid, len(recordseq)))
                    output_handle.write(">%s\n%s\n" % (recordid, recordseq))
    #explicit raise (rather than assert) so the check still runs under python -O
    if not recorddict:
        raise AssertionError(
            'Error: no records detected from fasta file provided')
Example #2
0
            if attribute not in attributes:
                sys.exit(
                    'Harmonized attribute name: %s is invalid (not listed in attributenames.tsv)'
                    % attribute)

###run Edirect commands

if args.accessiontype == 'nucleotide':
    print('retrieving nucleotide accession metadata from NCBI')

    #accessions are taken from the first column of a headerless tab-separated file
    accessionsdf = pd.read_csv(str(args.accessions), header=None, sep='\t')
    accessions = accessionsdf.iloc[:, 0].tolist()

    runsubprocess(['mkdir -p %s' % outputpath], shell=True)
    #start the metadata table with its header line; the with-block closes the handle even if the write fails
    with open('%s/nucleotidemetadata.tsv' % outputpath, 'w') as f:
        f.write(
            'Accession\tCreateDate\tUpdateDate\tMoleculeType\tLength\tCompleteness\tSourceGenomeType\tSourceTaxon\tSourceTaxID\tAssemblyMethod\tGenomeCoverage\tSequencingTechnology\tAnnotationPipeline\tAnnotationMethod\tBioprojectAccession\tBiosampleAccession\tSRAAccession\tAssemblyAccession\tPubMedID\n'
        )
    #create (or truncate to empty) the missing accessions file
    with open('%s/missingaccessions.txt' % outputpath, 'w') as f:
        pass

    accessionslen = len(accessions)
    chunklen = int(args.batchsize)

    runsubprocess([
        'econtact -email %s -tool nucleotidemetadatadownload' %
        str(args.emailaddress)
    ],
Example #3
0
import sys, os, re
from pythonmods import runsubprocess

dirpath = sys.argv[1]  #args.sequences directory path
filepathinfo = sys.argv[2]
blastdbdir = sys.argv[3]  #actually where blastdbs are stored
blasttype = sys.argv[4]

runsubprocess(['mkdir -p %s' % blastdbdir], shell=True)

directory = str(dirpath).rstrip('/')
#os.listdir returns entries in arbitrary order; sort them so that the output order, and which file is kept when a sample appears twice, are reproducible
dircontents = sorted(os.listdir(directory))

samples = set()
#write one line per sample: sample name, fasta filepath, blast database path
with open(filepathinfo, 'w') as f2:
    for dircontent in dircontents:
        filepath = '%s/%s' % (directory, dircontent)
        if not os.path.isfile(filepath):
            continue
        #work out sample name and extension with any trailing .gz ignored
        if dircontent.endswith('.gz'):
            fastaname = dircontent[:-len('.gz')]
        else:
            fastaname = dircontent
        sample, extension = os.path.splitext(fastaname)
        if extension not in {'.fa', '.fasta', '.fna'}:  #check for fasta files...
            continue
        if sample in samples:  #skip duplicates e.g. sample.fa and sample.fa.gz
            continue
        samples.add(sample)
        blastdbpath = '%s/%s_db' % (blastdbdir, sample)
        f2.write('%s\t%s\t%s\n' % (sample, filepath, blastdbpath))
Example #4
0
    inclusionpresent = 'inclusionabsent'
    inclusionarg = 'placeholder'
elif args.annotationtxt_inclusion != None:
    inclusionpresent = 'commandline'
    inclusionarg = str(','.join(args.annotationtxt_inclusion))
else:
    inclusionpresent = 'filepath'
    inclusionarg = args.annotationtxt_inclusion_file
    if os.path.isfile(inclusionarg) == False:
        print('Error: %s is not a valid filepath' % inclusionarg)
        sys.exit()

#normalise the input directory path by dropping any trailing slashes
inputdirtext = str(args.inputdir)
args.inputdir = inputdirtext.rstrip('/')

#make sure the output directory exists
mkdircommand = 'mkdir -p %s' % outputpath
runsubprocess([mkdircommand], shell=True)

if args.features == None:
    runsubprocess([
        'Rscript',
        '%s/genoplotr.R' % sourcedir,
        str(args.inputdir),
        str(','.join(args.syntax)),
        str(args.sequencelengths),
        str(args.comparisons),
        str(args.seg_plots), outputpath,
        str(args.comparisontype),
        str(args.main),
        str(args.main_pos),
        str(';'.join(args.sequencefills)),
        str(';'.join(args.sequenceoutlines)),
Example #5
0


noblasthits=False

#never write into an output directory that already exists
if os.path.exists(outputpath):
    sys.exit('Error: %s output directory already exists, delete directory and try again'%outputpath)

if args.sequences is not None:
    blasttype='allvallpairwise'
    blastdbdir='%s/blastdbs'%outputpath
    filepathinfo='%s/filepathinfo.tsv'%outputpath
    subjectsamples='%s/allsubjects.txt'%outputpath
    seqlengthspath='%s/seqlengths.tsv'%outputpath
    threadcount=str(args.threads)
    #trailing arguments passed identically to both blast wrapper scripts
    blastsettings=[str(args.evalue),str(args.wordsize),str(args.task),str(args.cullinglimit),threadcount,str(args.bidirectionalblast),blasttype]
    if fastafileinput in ('file','stdin'):
        #single fasta input: split it per sample, then build databases and blast
        splitfastas(args.sequences,blastdbdir,filepathinfo,seqlengthspath)
        runsubprocess(['bash','%s/makeblastdbs.sh'%sourcedir,filepathinfo,threadcount,sourcedir])
        laterruntime=runtime()
        #print(laterruntime-startruntime, 'runtime; finished creating blast databases')
        print('finished creating blast databases')
        runsubprocess(['bash','%s/runblast.sh'%sourcedir,outputpath,blastdbdir,filepathinfo]+blastsettings,preexec_fn='sigpipefix')
        laterruntime=runtime()
        #print(laterruntime-startruntime, 'runtime; finished running blast')
        print('finished running blast')
    else:
        #any other input type: collect filepaths and sequence lengths first, then build databases and blast
        runsubprocess(['python','%s/getdirpaths.py'%sourcedir,args.sequences,filepathinfo,blastdbdir,blasttype])
        runsubprocess(['python','%s/getseqlengths.py'%sourcedir,seqlengthspath,filepathinfo])
        runsubprocess(['bash','%s/makeblastdbs_editfastas.sh'%sourcedir,filepathinfo,threadcount,sourcedir])
        laterruntime=runtime()
        #print(laterruntime-startruntime, 'runtime; finished creating blast databases')
        print('finished creating blast databases')
        #filepathinfo is passed twice, as in the original call
        runsubprocess(['bash','%s/runblast_dirinput.sh'%sourcedir,outputpath,sourcedir,filepathinfo,filepathinfo]+blastsettings,preexec_fn='sigpipefix')
Example #6
0
#!/usr/bin/env python
import os, datetime
from Bio import SeqIO
from pythonmods import runsubprocess

#directory containing this script
sourcedir = os.path.dirname(os.path.abspath(__file__))

#create the target directory, then clone the plasmidfinder database into it
output_folder = './databases/plasmidfinder_db'
runsubprocess(['mkdir -p %s' % output_folder], shell=True)

cmdArgs = [
    'git clone https://bitbucket.org/genomicepidemiology/plasmidfinder_db.git ./databases/plasmidfinder_db'
]
runsubprocess(cmdArgs, shell=True)

print('Retrieved plasmidfinder_db from bitbucket')

#every .fsa file other than enterobacteriaceae.fsa is collected in gramposfastas
gramposfastas = [
    filename for filename in os.listdir('./databases/plasmidfinder_db')
    if filename.endswith('.fsa') and filename != 'enterobacteriaceae.fsa'
]

#combine gram-positive replicons into single gram-positive database
#NOTE(review): f2 is not closed within this span - presumably it is written to and closed further down; confirm
f2 = open('./databases/plasmidfinder_db/gram_positive.fsa', 'w')
for filename in gramposfastas:
    #header prefix is the file name without its .fsa extension.
    #splitext is used because str.rstrip('.fsa') strips any trailing '.', 'f', 's' or 'a' characters rather than the suffix (a name such as 'trans.fsa' would become 'tran')
    fileprefix = os.path.splitext(filename)[0]
    with open(os.path.join(output_folder, filename)) as f:
        for indx, seq_record in enumerate(SeqIO.parse(f, 'fasta')):
            fastaheader = str(seq_record.id)
            #new id has the form <file prefix>|<original id>
            newfastaheader = '%s|%s' % (fileprefix, fastaheader)
            seq_record.id = newfastaheader
Example #7
0
#command-line interface; argparse's automatic -h/--help is switched off and re-added by hand with its own help text
parser = argparse.ArgumentParser(
    description="ATCG: Alignment Based Tool for Comparative Genomics; get feature annotation files in correct format for visualisation.py",
    add_help=False,
)
parser.add_argument(
    '-h', '--help',
    action='help',
    default=argparse.SUPPRESS,
    help='Show this help message and exit.',
)
parser.add_argument(
    '-t', '--annotationtype',
    help='The type of annotation file that requires conversion to correct format (required)',
    choices=['prokka','genbank'],
    type=str,
    required=True,
)
parser.add_argument(
    '-i', '--inputpath',
    help='The input directory (containing annotation files) or annotation file to be converted to correct format (required)',
    required=True,
)
parser.add_argument(
    '-o', '--outdir',
    help='The output directory (required)',
    required=True,
)
parser.add_argument(
    '-s', '--seqnames',
    help='A file containing the sequence names associated with the annotation file(s) in the first column (required if annotationtype is prokka)',
    required=False,
)

args = parser.parse_args()
#output directory expressed relative to cwdir
outputpath=os.path.relpath(args.outdir, cwdir)

#prokka input is only usable together with a file of the original sequence names
#errors exit through sys.exit(message): the message goes to stderr and the exit status is 1 (a bare sys.exit() would report success with status 0)
if args.seqnames is None and args.annotationtype=='prokka':
    sys.exit('Error: if using prokka annotation file(s) as input, a file containing the associated sequence names (the original names, with no changes introduced by prokka) must be provided')

runsubprocess(['mkdir -p %s'%outputpath],shell=True)

#check if input is file or directory
if os.path.isfile(args.inputpath):
    inputpathtype='file'
elif os.path.isdir(args.inputpath):
    inputpathtype='directory'
else:
    sys.exit('Error: %s is not a file or directory'%args.inputpath)

if args.annotationtype=='prokka':
    seqnamespath=str(args.seqnames)
    inputpathtext=str(args.inputpath)
    if inputpathtype=='directory':
        #directory input: output of concatenateprokka.sh is piped into fixprokkagff.py through the shell
        pipelinecommand='bash %s/concatenateprokka.sh %s | python %s/fixprokkagff.py %s %s %s'%(sourcedir,inputpathtext,sourcedir,seqnamespath,outputpath,inputpathtype)
        runsubprocess([pipelinecommand],shell=True)
    else:
        #file input: fixprokkagff.py is given the annotation file path as its last argument
        fixscript='%s/fixprokkagff.py'%sourcedir
        runsubprocess(['python',fixscript,seqnamespath,outputpath,inputpathtype,inputpathtext])
Example #8
0
#input/output options
parser.add_argument(
    '-b','--besthits',
    help='Text file containing best hits or reciprocal best hits',
    required=False,
)
parser.add_argument(
    '-o','--out',
    help='Output directory (required)',
    required=True,
)
#blast cutoffs
parser.add_argument(
    '-e','--evalue',
    help='BLAST e-value cutoff (default: 1e-6)',
    default=1e-6,
    type=float,
)
parser.add_argument(
    '-i','--pident',
    help='BLAST percent identity cutoff (default: 40)',
    default=40,
    type=int,
)
parser.add_argument(
    '-c','--qcovhsp',
    help='BLAST hsp query coverage cutoff (default: 80)',
    default=80,
    type=int,
)
#run options
parser.add_argument(
    '-t','--threads',
    help='Number of threads to use (default: 1)',
    default=1,
    type=int,
)
parser.add_argument(
    '--breakpoint',
    action='store_true',
    help='Calculate breakpoint distance statistics (default: do not calculate unless --besthits file is provided)',
)
args = parser.parse_args()
#output directory expressed relative to cwdir
outputpath=os.path.relpath(args.out, cwdir)


#at least one of the two input sources has to be supplied
if args.sequences is None and args.besthits is None:
    parser.error('as input, you must either provide --sequences or --besthits')

if args.sequences is not None:
    threadcount=str(args.threads)
    #pipeline steps, run one after another in this order
    pipelinesteps=[
        ['python','%s/getproteins.py'%sourcedir,outputpath,str(args.sequences)],
        ['bash','%s/makeblastdbs.sh'%sourcedir,outputpath,threadcount,sourcedir],
        ['bash','%s/runblast.sh'%sourcedir,outputpath,str(args.evalue),threadcount],
        ['bash','%s/reformatblast.sh'%sourcedir,outputpath,str(args.pident),str(args.qcovhsp)],
        ['Rscript','%s/getreciprocalhits.R'%sourcedir,outputpath],
    ]
    for stepargs in pipelinesteps:
        runsubprocess(stepargs)

    #breakpoint distances are only calculated here when --breakpoint was given
    if args.breakpoint:
        rbhinput='metamorth'
        runsubprocess(['Rscript','%s/getbreakpointdistance.R'%sourcedir,outputpath,threadcount,rbhinput])

else:
    #only --besthits was supplied: create the output subdirectories and go straight to the breakpoint distance script
    rbhinput='userprovided'
    runsubprocess(['mkdir -p %s/blast'%outputpath],shell=True)
    runsubprocess(['mkdir -p %s/output'%outputpath],shell=True)
    breakpointargs=['Rscript','%s/getbreakpointdistance.R'%sourcedir,outputpath,str(args.threads),rbhinput,str(args.besthits)]
    runsubprocess(breakpointargs)
Example #9
0
    elif rmlstdbexists == False:
        sys.exit(
            'Error: the rMLST database must be installed first (see README)')
    else:
        sys.exit(
            'Error: the PlasmidFinder database must be installed first (see README)'
        )

#check --sampleoutput flag used correctly if provided
#if args.sampleoutput==True and args.contigsamples==None:
#    sys.exit('Error: --sampleoutput is only possible if the --contigsamples flag is provided, to specify sample groupings')
#always produce sample-level output if args.contigsamples is provided
if args.contigsamples is not None:
    args.sampleoutput = True

#make sure the output directory exists
cmdArgs = ['mkdir -p %s' % outputpath]
runsubprocess(cmdArgs, shell=True)

###retrieve accessions and sequences from NCBI
if args.inhousesequences is None and args.restartwithsequences == False:
    if args.accessions is None:
        #flag passed to downloadaccessions.sh saying whether a date query was supplied.
        #this has to be an assignment: the previous `datepresent == "present"` was a comparison, which left datepresent undefined whenever --datequery was given
        if args.datequery is None:
            datepresent = "absent"
        else:
            datepresent = "present"
        runsubprocess([
            'bash',
            '%s/downloadaccessions.sh' % sourcedir, datepresent,
            str(args.taxonomyquery),
            str(args.datequery),
            str(args.dbsource), outputpath
        ])