# Build the list of per-sequence cover/depth matrix files, skipping empty
# sequence names.
coverFiles = []
for sequence in sequences:
    if sequence != '':
        coverFiles.append(inPath + sequence + '/' + sequence + '_CoverDepthMatrix.txt')

# Header row shared by both merged tables; one column label is appended per
# input file in the loop below.
output = "replicon__gene"
outCover = outPath + refName + "_CoverMatrix.csv"
outDepth = outPath + refName + "_DepthMatrix.csv"

# Row-aligned accumulators: geneList[i] labels coverList[i] and depthList[i],
# and each inner list gains one value per input file.
geneList = []
coverList = []
depthList = []

# NOTE(review): first_file is never cleared within this excerpt; presumably it
# is set to False once the first file has been read -- confirm in the code
# that follows, otherwise every file re-appends rows.
first_file = True

for file in coverFiles:
    (prefix, name, ext) = splitPath(file)
    # Drop the last 17 characters of the base name ("_CoverDepthMatrix" is 17
    # long) to get the column label -- assumes splitPath returns the name
    # without its extension; TODO confirm.
    output = output + "," + name[:-17]
    # The with-block guarantees the handle is closed even when a malformed
    # row (fewer than three comma-separated fields) raises IndexError.
    with open(file) as coverFile:
        count = 0
        for line in coverFile:
            # Strip the line terminator before splitting.
            if line.endswith('\n'):
                line = line[:-1]
            cover = line.split(',')
            if first_file:
                # First file defines the row set: one row per input line.
                geneList.append(cover[0])
                coverList.append([])
                depthList.append([])
            # Rows are matched by position, not by gene name.
            coverList[count].append(cover[1])
            depthList[count].append(cover[2])
            count += 1
If HetsVCF is set to 'True', the heterozygous SNP calls will be written to a VCF

example:
python finalFilter.py <raw>.vcf <q30>.vcf <outHetFile> <HetsVCF>

Created: 24/01/2013
Modified: 13/11/2015
author: David Edwards
"""
import sys
from pipe_utils import splitPath

# Positional arguments, in the order given by the usage line above.
inFile = sys.argv[1]
outFile = sys.argv[2]
(prefix, middle, ext) = splitPath(outFile)
# sys.argv[3] is used as a directory prefix here: the report name is built
# from the output file's base name with its last three characters replaced
# by "het.txt".
outHetFile = sys.argv[3] + "/" + middle[:-3] + "het.txt"
# Only the exact string "True" enables the het VCF; any other value disables it.
if sys.argv[4] == "True":
    HetsVCF = True
else:
    HetsVCF = False

vcfIn = open(inFile)
vcfOut = open(outFile, "w")
hetOut = open(outHetFile, "w")
if HetsVCF:
    # The het VCF path is built from the output file's own prefix rather than
    # from sys.argv[3] (presumably the output file's directory -- confirm
    # against splitPath).
    outHetPosFile = prefix + "/" + middle[:-3] + "het.vcf"
    hetVcfOut = open(outHetPosFile, "w")
hetCount = 0
# Main pass over the input, one line at a time (loop body continues past this
# excerpt).
for line in vcfIn:
example:
python deriveRepStats.py <isolate>_rep_cover.txt replicon depth_fail cover_fail runType mapped_fail check_reads_mapped

Created: 29042013
Modified: 16072014
author: David Edwards
'''
import sys
from pipe_utils import splitPath

# Output row is built up as a tab-separated string, starting with the
# sequence name.
output_RepStats = ""
repCoverFile = sys.argv[1]
repCover = open(repCoverFile, "r")
(prefix, middle, ext) = splitPath(repCoverFile)
# Drop the last 10 characters of the base name (presumably the "_rep_cover"
# suffix shown in the usage line -- confirm against splitPath).
seq_name = middle[:-10]
output_RepStats += seq_name +"\t"
replicon = sys.argv[2]
# Keep only the part of the replicon name before the first '.'.
if replicon.find('.') != -1:
    temp_rep = replicon.split('.')
    replicon = temp_rep[0]
replicon_names = []
# Get % cover and depth for each replicon.
depth_test_value = 0.0
for line in repCover:
    # Whitespace-separated fields; the first field is recorded as the
    # replicon name.
    entry = line.split()
    replicon_names.append(entry[0])
Last Modified: 17/02/2015 - fixed locus tag designation
author: David Edwards
'''
import sys, glob
from pipe_utils import splitPath, get_key
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
# NOTE(review): Bio.Alphabet was removed in Biopython 1.78, so this import
# fails on newer releases -- confirm the Biopython version this script is
# pinned to.
from Bio.Alphabet import IUPAC

# Positional arguments: output directory prefix, GenBank reference file,
# and the coverage file for one sample.
outPath = sys.argv[1]
genbankName = sys.argv[2]
coverFileName = sys.argv[3]
(prefix, name, ext) = splitPath(coverFileName)
# The coverage file's base name minus its last nine characters is used both
# as the sub-directory and as the output file stem. outPath is concatenated
# directly, so it must already end with a path separator.
outCover = outPath + name[:-9] + "/" + name[:-9] + "_CoverDepthMatrix.txt"
geneList = []
geneCoverList = []
geneDepthList = []
totalBases = 0
repliconList =[]
records = SeqIO.parse(genbankName, "genbank")
for record in records:
    # Running count of CDS features within the current record.
    feature_count = 0
    for f in record.features:
        if f.type == "CDS":
#        if f.type == "misc_feature":
            feature_count += 1
example:
python getSNPList.py <statsFile> <replicon> <output>

Created: 12052013
Modified: 15042014
author: David Edwards
'''
import sys, glob
from pipe_utils import splitPath

statsFile_name = sys.argv[1]
replicon = sys.argv[2]
outputFile_name = sys.argv[3]
outputFile = open(outputFile_name, "w")
finalList = []
(prefix, name, ext) = splitPath(statsFile_name)
# Per-isolate q30 VCFs for this replicon are looked up in a "vcf"
# sub-directory under the stats file's prefix.
vcfs = prefix + '/vcf/*_' + replicon + '_q30.vcf'
for file in glob.glob(vcfs):
    # prefix/name/ext are rebound here to describe the current VCF.
    (prefix, name, ext) = splitPath(file)
    # Drop the trailing "_<replicon>_q30" (5 + len(replicon) characters) to
    # leave the isolate name -- assumes splitPath returns the name without
    # its extension; TODO confirm.
    name = name[:(-1*(5+len(replicon)))]
    # The stats file is re-opened and scanned once per VCF.
    statsFile = open(statsFile_name)
    for sample in statsFile:
        # Skip the header row, which starts with "Isolate".
        if sample.startswith("Isolate") != True:
            splitSample = sample.split("\t")
            # Only use the VCF when its isolate is listed and the last stats
            # column does not start with "f" (presumably a fail flag --
            # verify against the stats file format).
            if (name == splitSample[0]) and (splitSample[-1].startswith("f") != True):
                vcfFile = open(file)
                varList = []
                mergeList = []
                for line in vcfFile:
                    # Lines beginning with '#' are skipped.
                    if line.startswith("#") == False:
                        splitLine = line.split("\t")
# Each SNP entry starts as a one-element list holding the first
# whitespace-separated field of a line from the SNP list.
for line in snpList:
    splitLine = line.split()
    SNP.append([splitLine[0]])
snpList.close()
# Populate reference calls from the reference fasta.
references = SeqIO.parse(reference_name, "fasta")
for reference in references:
    if reference.name == replicon:
        for call in range(len(SNP)):
            # The stored value is converted to int and treated as 1-based,
            # hence the -1 when indexing the sequence.
            base = reference.seq[int(SNP[call][0])-1]
            SNP[call].append(base.upper())
# Call bases for each SNP from the consensus file of the strain.
(prefix, name, ext) = splitPath(consensus_in)
# A merge_prefix of '-' means the warning file path is derived from the
# consensus file's own prefix (minus its last four characters); otherwise
# merge_prefix is used as given.
if merge_prefix == '-':
    warning_file = prefix[:-4] + replicon +'_'+name +'_warning.txt'
else:
    warning_file = merge_prefix + replicon +'_'+name +'_warning.txt'
# NOTE(review): the warning file name above uses the untrimmed name; the
# trim below is assumed to apply on both branches -- confirm the nesting
# against the original file.
name = name[:-4]
consensus = SeqIO.parse(consensus_in, "fastq")
statsFile = open(stats)
for sample in statsFile:
    # Skip the header row, which starts with "Isolate".
    if sample.startswith("Isolate") != True:
        splitSample = sample.split("\t")
        record_found = False
        # Proceed only when this strain is listed and its last stats column
        # does not start with "f" (presumably a fail flag -- verify).
        if (name == splitSample[0]) and (splitSample[-1].startswith("f") != True):
            header = header + ',' + name
            for record in consensus:
                # Body of this test continues past the excerpt.
                if record.name == replicon:
example:
python mergeRepStats.py <new_replicon_RepStats.tab> sdOutgroupMutiplier reads_to_replace <mergeDirectory> runType

Created: 23/10/2012
Modified: 28/10/2013 to mergeRepStats from mergeStats
          15/04/2014 changed to produce outgroup.txt file if there are any outgroups to report
          20/05/2014 fix to outgroup reporting
          04/07/2014 change to replace_reads handling
author: David Edwards
'''
import sys, glob
from pipe_utils import splitPath

inFileName = sys.argv[1]
(inPrefix, inName, inExt) = splitPath(inFileName)
# The multiplier must parse as an integer; int() raises ValueError otherwise.
sdOutgroupMutiplier = int(sys.argv[2])
# The merge directory (sys.argv[4]) is concatenated directly with the file
# name, so it must already end with a path separator.
mergeFileName = sys.argv[4] + inName + inExt
replace = sys.argv[3]
runType = sys.argv[5]
# Outgroup report sits in the merge directory; the last nine characters of
# the input base name (presumably "_RepStats" -- confirm) are replaced by
# "_outgroups.txt".
outgroup_outfile_name = sys.argv[4] + inName[:-9] + '_outgroups.txt'
outgroups = []
# Accumulators, all initialised to zero or empty here.
average = 0
sd = 0
count = 0
output = ""
# Combine the two sets
# note: only need to count SNPs for 'phylogeny' runType
parser.add_option("-i", "--input_file", action="store", dest="input_file", help="allele table (required)", default="") parser.add_option("-o", "--output_file", action="store", dest="output_file", help="name for output file (no ext, only required if '_alleles' missing from input name), Note: if '_alleles(_*)' ending present, this is removed and '_SNP_diff.nxs' added [setting for RedDog pipeline] (default: none)", default="") parser.add_option("-d", "--directory", action="store", dest="directory", help="directory to send output files (default: none)", default="") return parser.parse_args() if __name__ == "__main__": (options, args) = main() ### MAIN PROCESS inputName = options.input_file inputFile = open(inputName) (prefix, name, ext) = splitPath(inputName) strains = [] if name.find('_alleles') != -1: split_name = name.split('_') split = 0 refName = split_name[split] split += 1 while split_name[split] != 'alleles': refName += '_' + split_name[split] split += 1 directory = options.directory if directory == '': if prefix != '': outName = prefix + '/' + refName + '_SNP_diff.nxs'