def main():
    """Copy a fasta file, passing every header through fixHeader.

    Expects exactly two positional command-line arguments: the input fasta
    path and the output fasta path.

    Returns:
        0 on success, or 1 (after printing the help text) when the number of
        positional arguments is not two.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    usage = "usage: %prog [options] <fasta input file> <fasta output file>\n\n" + \
            " <fasta file>: fasta sequence to annotate\n"
    description = "Ensure sequence names contain only alphanumeric characters\n"
    parser = OptionParser(usage=usage, description=description)

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        return 1

    inputName = args[0]
    outputName = args[1]

    # The context managers close both files even if reading or writing raises.
    with open(inputName, "r") as inputFile, open(outputName, "w") as outputFile:
        for header, seq in fastaRead(inputFile):
            fastaWrite(outputFile, fixHeader(header), seq)
    return 0
def run(self, params="-s 2 -T 0 -Q 0 -a 1"):
    """Align the reads to the reference with LAST and write the result as SAM.

    Writes one @SQ line per reference sequence to self.outputSamFile, converts
    the fastq reads to a fasta file in the local temp dir, copies and indexes
    the reference with lastdb, aligns with lastal and finally appends the
    output of maf-convert.py to self.outputSamFile.

    Args:
        params: option string inserted verbatim into the lastal command line.
    """
    localReferenceFastaFile = os.path.join(self.getLocalTempDir(), "ref.fa") #Because we don't want to have any crufty files created in the local temp dir.
    indexFile = os.path.join(self.getLocalTempDir(), "my-index") #Index file
    mafFile = os.path.join(self.getLocalTempDir(), "out.maf") #MAF file

    #Hack to make last work, creating SQ line
    #Both handles are closed by the context managers, also when parsing fails.
    with open(self.outputSamFile, 'w') as samHandle:
        with open(self.referenceFastaFile, 'r') as referenceHandle:
            for name, seq in fastaRead(referenceHandle):
                samHandle.write("@SQ\tSN:%s\tLN:%s\n" % (name.split()[0], len(seq)))

    #Make fasta file, as last fastq seems broken
    localReadFile = os.path.join(self.getLocalTempDir(), "reads.fa") #Fasta copy of the reads
    with open(localReadFile, 'w') as readHandle:
        for name, seq, quals in fastqRead(self.readFastqFile):
            fastaWrite(readHandle, name, seq)

    system("cp %s %s" % (self.referenceFastaFile, localReferenceFastaFile)) #Copy across the ref file
    system("lastdb %s %s" % (indexFile, localReferenceFastaFile)) #Build the index
    system("lastal %s %s %s > %s" % (params, indexFile, localReadFile, mafFile)) #Build the alignment
    system("maf-convert.py sam %s >> %s" % (mafFile, self.outputSamFile)) #Now convert sam file
def getFastaDict(self):
    """Export self.genome from the HAL file at self.halPath and load it.

    Runs hal2fasta into a temp file created under the global temp dir and
    reads that file back with fastaRead.

    Returns:
        dict mapping each fasta header to its sequence.
    """
    fastaPath = getTempFile(rootDir=self.getGlobalTempDir())
    system("hal2fasta %s %s > %s" % (self.halPath, self.genome, fastaPath))
    return {header: seq for header, seq in fastaRead(fastaPath)}
def testFastaReadWriteC(self):
    """Tests consistency with C version of this function.

    For each of self.testNo rounds, writes up to nine random sequences to a
    temp fasta file, runs sonLib_fastaCTest on it, and checks that the records
    read back from the second temp file match the originals in order.
    """
    tempFile = getTempFile()
    self.tempFiles.append(tempFile)
    tempFile2 = getTempFile()
    self.tempFiles.append(tempFile2)
    for test in range(0, self.testNo):
        fastaNumber = random.choice(range(10))
        l = [getRandomSequence() for i in range(fastaNumber)]
        with open(tempFile, 'w') as fileHandle:
            for name, seq in l:
                fastaWrite(fileHandle, name, seq)

        command = "sonLib_fastaCTest %s %s" % (tempFile, tempFile2)
        print(command)
        system(command)

        # Reverse so that pop() hands the expected records back in file order.
        l.reverse()
        # Context managers release the handles even when an assertion fails.
        with open(tempFile2, 'r') as fileHandle:
            with io.StringIO() as outFh:
                for i in fastaRead(fileHandle):
                    name, seq = i
                    assert i == l.pop()
                    fastaWrite(outFh, name, seq)
def getSequences(sequenceFile):
    """Read a fasta file into a dict.

    Args:
        sequenceFile: path of the fasta file to read.

    Returns:
        dict mapping each header to its sequence; when a header occurs more
        than once the last occurrence wins.
    """
    sequences = {}
    # The context manager closes the file even if fastaRead raises.
    with open(sequenceFile, "r") as fileHandle:
        for header, sequence in fastaRead(fileHandle):
            sequences[header] = sequence
    return sequences
def getCactusInputs_funkyHeaderNames(regionNumber=0, tempDir=None):
    """Gets Blanchette-region inputs (region 0 by default) whose sequences
    have been given weird header names that might get parsed wrong and cause
    issues.

    Every input fasta is rewritten to a file named after its index inside
    tempDir (a fresh temp dir when tempDir is None); records are appended to
    that file with headers taken cyclically from a fixed list.

    Returns the (sequences, newickTreeString) pair from
    getCactusInputs_blanchette, with each sequence path replaced by the
    rewritten file.
    """
    sequences, newickTreeString = getCactusInputs_blanchette(regionNumber=regionNumber)

    if tempDir is None:
        tempDir = getTempDir()

    # Should also consider "bar foo", "ba rfoo", but we currently
    # throw away everything but the first token (probably because of
    # cigar parsing).
    funkyHeaderNames = ['id=1|foo', 'test1|1600', 'test2|', '|test3', 'id=1|bar']

    # Running count of records written, across all input files, so the header
    # cycle continues from one file to the next.
    recordsWritten = 0
    for pathIndex, originalPath in enumerate(sequences):
        rewrittenPath = os.path.join(tempDir, str(pathIndex))
        for _originalHeader, sequence in fastaRead(originalPath):
            funkyHeader = funkyHeaderNames[recordsWritten % len(funkyHeaderNames)]
            recordsWritten += 1
            fastaWrite(rewrittenPath, funkyHeader, sequence, 'a')
        sequences[pathIndex] = rewrittenPath
    return sequences, newickTreeString
def run(self, params="-s 2 -T 0 -Q 0 -a 1"):
    """Align the reads to the reference with LAST and write the result as SAM.

    Writes one @SQ line per reference sequence to self.outputSamFile, converts
    the fastq reads to a fasta file in the local temp dir, copies and indexes
    the reference with lastdb, aligns with lastal and finally appends the
    output of maf-convert.py to self.outputSamFile.

    Args:
        params: option string inserted verbatim into the lastal command line.
    """
    localReferenceFastaFile = os.path.join(self.getLocalTempDir(), "ref.fa") #Because we don't want to have any crufty files created in the local temp dir.
    indexFile = os.path.join(self.getLocalTempDir(), "my-index") #Index file
    mafFile = os.path.join(self.getLocalTempDir(), "out.maf") #MAF file

    #Hack to make last work, creating SQ line
    #Both handles are closed by the context managers, also when parsing fails.
    with open(self.outputSamFile, 'w') as samHandle:
        with open(self.referenceFastaFile, 'r') as referenceHandle:
            for name, seq in fastaRead(referenceHandle):
                samHandle.write("@SQ\tSN:%s\tLN:%s\n" % (name.split()[0], len(seq)))

    #Make fasta file, as last fastq seems broken
    localReadFile = os.path.join(self.getLocalTempDir(), "reads.fa") #Fasta copy of the reads
    with open(localReadFile, 'w') as readHandle:
        for name, seq, quals in fastqRead(self.readFastqFile):
            fastaWrite(readHandle, name, seq)

    system("cp %s %s" % (self.referenceFastaFile, localReferenceFastaFile)) #Copy across the ref file
    system("lastdb %s %s" % (indexFile, localReferenceFastaFile)) #Build the index
    system("lastal %s %s %s > %s" % (params, indexFile, localReadFile, mafFile)) #Build the alignment
    system("maf-convert.py sam %s >> %s" % (mafFile, self.outputSamFile)) #Now convert sam file
def progressiveFunction(self, experimentFile, toilDir,
                        batchSystem, buildAvgs,
                        buildReference,
                        buildHal,
                        buildFasta,
                        toilStats,
                        subtreeRoot=None):
    """Run a progressive cactus alignment and check the headers in its output.

    Creates a multi-cactus project in a fresh temp directory under the current
    working directory, runs cactus progressive on it, then verifies that every
    sequence header found in the .c2h files for an input (non-outgroup) genome
    also appears in that genome's input fasta. The temp directory is removed
    at the end when no check has failed.

    Args:
        experimentFile: path of the experiment XML file.
        toilDir: toil directory handed to runCactusProgressive and to the
            final status check.
        batchSystem, buildAvgs, toilStats: forwarded to runCactusProgressive.
        buildReference, buildHal, buildFasta: accepted but not used here.
        subtreeRoot: forwarded to runCreateMultiCactusProject as root.
    """
    tempDir = getTempDirectory(os.getcwd())
    tempExperimentDir = os.path.join(tempDir, "exp")
    runCreateMultiCactusProject(experimentFile,
                                tempExperimentDir,
                                fixNames=False,
                                root=subtreeRoot)
    logger.info("Put the temporary files in %s" % tempExperimentDir)

    runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                         toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats)

    # Check that the headers and sequences in the output are the
    # same as the sequences in the input (minus differences in
    # repeat-masking)
    exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqMap = exp.buildSequenceMap()
    # Maps genome name -> set of headers in fasta
    headers = {}
    for genomeName, inputSequencePath in seqMap.items():
        if os.path.isdir(inputSequencePath):
            # Some "input sequence paths" are actually provided as
            # directories containing multiple FASTAs
            concatenatedPath = getTempFile()
            system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
            inputSequencePath = concatenatedPath
        # A set, because it is probed once per sequence line of every c2h
        # file below and is only ever used for membership tests.
        headers[genomeName] = set(map(itemgetter(0), fastaRead(inputSequencePath)))

    # check headers inside .c2h output
    for expPath in glob.glob('%s/*/*_experiment.xml' % (tempExperimentDir)):
        subExp = ExperimentWrapper(ET.parse(expPath).getroot())
        outgroups = subExp.getOutgroupEvents()
        c2hPath = subExp.getHALPath()
        with open(c2hPath) as f:
            for line in f:
                fields = line.split('\t')
                if fields[0] == 's':
                    # Sequence line; [1:-1] drops the first and last
                    # character of the genome and header fields.
                    genome = fields[1][1:-1]
                    header = fields[2][1:-1]
                    if genome in headers and genome not in outgroups:
                        # This genome is an input genome
                        self.assertTrue(header in headers[genome],
                                        'Header %s from output c2h %s not found in input fa %s'
                                        ' for genome %s' % (header, c2hPath, seqMap[genome], genome))

    runToilStatusAndFailIfNotComplete(toilDir)
    system("rm -rf %s" % tempDir)
def processSequence(self, eventName, sequencePath):
    """Record the fixed header of every sequence in a fasta file.

    For each header in the file, computes fixHeader(header, event=...) with
    the '.' characters of eventName replaced by '_', prints the mapping, and
    stores it in self.nameMap. A header that is already in self.nameMap must
    map to the same fixed header.

    Args:
        eventName: event name passed (with '.' replaced by '_') to fixHeader.
        sequencePath: path of the fasta file to scan.
    """
    # Loop-invariant, so computed once.
    event = eventName.replace(".", "_")
    # The context manager closes the file; the original never closed it.
    with open(sequencePath, "r") as fileHandle:
        for header, sequence in fastaRead(fileHandle):
            fixedHeader = fixHeader(header, event=event)
            print (header, fixedHeader, eventName)
            if header in self.nameMap:
                assert self.nameMap[header] == fixedHeader
            else:
                self.nameMap[header] = fixedHeader
def getFastasFromSequence(sequenceDirs):
    """Collect every fasta record found in the given directories.

    Every entry returned by os.listdir for each directory is opened and read
    as a fasta file.

    Args:
        sequenceDirs: iterable of directory paths.

    Returns:
        list of (name, sequence) tuples, in directory order and, within a
        directory, in the order os.listdir returns the files.
    """
    #Get the sequences
    fastaSeqs = []
    for sequenceDir in sequenceDirs:
        for fastaFile in os.listdir(sequenceDir):
            # The context manager closes the file even if fastaRead raises.
            with open(os.path.join(sequenceDir, fastaFile), 'r') as fileHandle:
                for name, sequence in fastaRead(fileHandle):
                    fastaSeqs.append((name, sequence))
    return fastaSeqs
def run(self, args=""):
    """Align the reads to the reference with lastz, writing SAM output.

    The reads are first passed through normaliseQualValues into a temp fastq
    file. If pysam cannot open the resulting SAM file (ValueError), @SQ header
    lines for every reference sequence are appended to it.

    Args:
        args: extra option string inserted verbatim into the lastz command.
    """
    tempFastqFile = os.path.join(self.getLocalTempDir(), "temp.fastq")
    normaliseQualValues(self.readFastqFile, tempFastqFile)
    system("lastz %s[multiple] %s %s --format=sam > %s" % (self.referenceFastaFile, tempFastqFile, args, self.outputSamFile))
    try:
        pysam.Samfile(self.outputSamFile, "r").close()
    except ValueError:
        #Hack to make lastz work, creating SQ lines when no alignments are found
        #Both handles are closed by the context managers, also on error.
        with open(self.outputSamFile, 'a') as samHandle:
            with open(self.referenceFastaFile, 'r') as referenceHandle:
                for name, seq in fastaRead(referenceHandle):
                    samHandle.write("@SQ\tSN:%s\tLN:%s\n" % (name.split()[0], len(seq)))
def getLowerCaseBases(sequenceFile):
    """Counts lower case bases in fasta sequences.

    A base counts as lower case when it differs from its own upper-cased
    form.

    Args:
        sequenceFile: path of the fasta file to scan.

    Returns:
        (total, totalMasked): the total number of bases in the file and the
        number of those that are lower case.
    """
    from sonLib.bioio import fastaRead
    totalMasked = 0
    total = 0
    # The context manager closes the file even if fastaRead raises.
    with open(sequenceFile, "r") as fileHandle:
        for header, sequence in fastaRead(fileHandle):
            totalMasked += sum(1 for base in sequence if base != base.upper())
            total += len(sequence)
    return total, totalMasked
def build_pos_map(alignment_path="/hive/users/ifiddes/notch2nl_suns/notch2_aligned.fasta"):
    """Build a map of alignment positions to sequence positions.

    Args:
        alignment_path: fasta file holding the aligned sequences, with '-'
            as the gap character. Defaults to the previously hard-coded file.

    Returns:
        defaultdict(dict) such that pos_map[name][column] is the number of
        non-gap characters of sequence `name` strictly before alignment
        column `column`. Columns are taken with zip, so they stop at the end
        of the shortest sequence.
    """
    r = {name: seq for name, seq in fastaRead(alignment_path)}
    # items() rather than iteritems() so this runs on Python 2 and Python 3.
    r_sort = sorted(r.items(), key=lambda x: x[0])
    names, seqs = zip(*r_sort)
    # Per sequence: how many non-gap characters have been consumed so far.
    tgt_is = {n: 0 for n in names}
    pos_map = defaultdict(dict)
    for ref_i, cs in enumerate(zip(*seqs)):
        # Record the positions before advancing past this column.
        for name, tgt_i in tgt_is.items():
            pos_map[name][ref_i] = tgt_i
        for name, c in zip(names, cs):
            if c != "-":
                tgt_is[name] += 1
    return pos_map
def run(self, args=""):
    """Align the reads to the reference with lastz, writing SAM output.

    The reads are first passed through normaliseQualValues into a temp fastq
    file. If pysam cannot open the resulting SAM file (ValueError), @SQ header
    lines for every reference sequence are appended to it.

    Args:
        args: extra option string inserted verbatim into the lastz command.
    """
    tempFastqFile = os.path.join(self.getLocalTempDir(), "temp.fastq")
    normaliseQualValues(self.readFastqFile, tempFastqFile)
    system("lastz %s[multiple] %s %s --format=sam > %s" % (self.referenceFastaFile, tempFastqFile, args, self.outputSamFile))
    try:
        pysam.Samfile(self.outputSamFile, "r").close()
    except ValueError:
        #Hack to make lastz work, creating SQ lines when no alignments are found
        #Both handles are closed by the context managers, also on error.
        with open(self.outputSamFile, 'a') as samHandle:
            with open(self.referenceFastaFile, 'r') as referenceHandle:
                for name, seq in fastaRead(referenceHandle):
                    samHandle.write("@SQ\tSN:%s\tLN:%s\n" % (name.split()[0], len(seq)))
def main():
    """Check that the headers of a fasta file are unique and well formed.

    Takes one positional argument, the input fasta path. Only the first
    whitespace-separated word of each header is examined. With
    --checkUCSCNames the word is further reduced to the part after its last
    '.', and that suffix is what the later checks and the uniqueness test see.

    Returns:
        0 when every header passes, or 1 (after printing the help text) when
        the number of positional arguments is not one.

    Raises:
        RuntimeError: on the first header that fails an enabled check or that
            duplicates an earlier one.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    usage = "usage: %prog [options] <fasta input file>\n\n" + \
            " <fasta file>: fasta sequence to check for unique headers\n"
    description = "Ensure sequence names are unique\n"
    parser = OptionParser(usage=usage, description=description)

    parser.add_option("--checkAlphaNumeric", dest="checkAlphaNumeric", action="store_true",
                      help="Checks that the first word contains only alphanumeric characters, periods or underscores.",
                      default=False)

    parser.add_option("--checkUCSCNames", dest="checkUCSC", action="store_true",
                      help="Checks that suffix of the first word after the last '.' character contains only alpha-numeric characters or underscores and is unique. This is useful if exporting to MAF, where sequences are named 'genome.chr'.",
                      default=False)

    parser.add_option("--checkAssemblyHub", dest="checkAssemblyHub",
                      action="store_true", help="Checks that the first word "
                      "of each header is able to be used in a UCSC Assembly "
                      "Hub.")

    options, args = parser.parse_args()

    if len(args) != 1:
        parser.print_help()
        return 1

    inputName = args[0]
    seen = set()
    # The context manager closes the file also when a check raises.
    with open(inputName, "r") as inputFile:
        for header, seq in fastaRead(inputFile):
            mungedHeader = header.split()[0]
            if options.checkAlphaNumeric and "".join([ i for i in mungedHeader if str.isalnum(i) ]) != mungedHeader: #Check is only alpha numeric
                raise RuntimeError("We found a non-alpha numeric character in the fasta header, and the config file (checkAlphaNumeric option) demands that all fasta headers be alpha numeric: %s" % header)
            if options.checkUCSC:
                mungedHeader = mungedHeader.split('.')[-1]
                if "".join([ i for i in mungedHeader if (str.isalnum(i) or i == '_' or i == '-' or i == ':') ]) != mungedHeader:
                    # The message names --checkAssemblyHub, the option as it
                    # is actually defined above.
                    raise RuntimeError("We found a non-alpha numeric, '-', ':' or '_' prefix in the fasta header (UCSC Names option), please modify the first word after the '>' and after the last '.' in every fasta header to only contain alpha-numeric, '_', ':' or '-' characters, or consider using a more lenient option like --checkAssemblyHub. The offending header: %s" % header)
            if options.checkAssemblyHub:
                if "".join([ i for i in mungedHeader if (str.isalnum(i) or i == '_' or i == '-' or i == ':' or i == ".") ]) != mungedHeader:
                    raise RuntimeError("An invalid character was found in the first word of a fasta header. Acceptable characters for headers in an assembly hub include alphanumeric characters plus '_', '-', ':', and '.'. Please modify your headers to eliminate other characters. The offending header: %s" % header)
            if mungedHeader in seen:
                raise RuntimeError("We found a duplicated fasta header, the first word of each fasta header should be unique within each genome, as this is a requirement for the output HAL file or any MAF file subsequently created. Please modify the input fasta file. Offending duplicate header: %s" % header)
            seen.add(mungedHeader)
    return 0
def build_pos_map(alignment_path="/hive/users/ifiddes/notch2nl_suns/notch2_aligned.fasta"):
    """Build a map of alignment positions to sequence positions.

    Args:
        alignment_path: fasta file holding the aligned sequences, with '-'
            as the gap character. Defaults to the previously hard-coded file.

    Returns:
        defaultdict(dict) such that pos_map[name][column] is the number of
        non-gap characters of sequence `name` strictly before alignment
        column `column`. Columns are taken with zip, so they stop at the end
        of the shortest sequence.
    """
    r = {name: seq for name, seq in fastaRead(alignment_path)}
    # items() rather than iteritems() so this runs on Python 2 and Python 3.
    r_sort = sorted(r.items(), key=lambda x: x[0])
    names, seqs = zip(*r_sort)
    # Per sequence: how many non-gap characters have been consumed so far.
    tgt_is = {n: 0 for n in names}
    pos_map = defaultdict(dict)
    for ref_i, cs in enumerate(zip(*seqs)):
        # Record the positions before advancing past this column.
        for name, tgt_i in tgt_is.items():
            pos_map[name][ref_i] = tgt_i
        for name, c in zip(names, cs):
            if c != "-":
                tgt_is[name] += 1
    return pos_map
def main():
    """Copy a fasta file, dropping the records that tooShort rejects.

    Takes two positional arguments, the input and output fasta paths. The
    input is read twice: once by containedSequences to build a lookup table,
    then again (after seeking back to the start) to filter the records.

    Returns:
        0 on success, or 1 (after printing the help text) when the number of
        positional arguments is not two.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    usage = "usage: %prog [options] <fasta input file> <fasta output file>\n\n" + \
            " <fasta file>: fasta sequence to filter\n"
    description = "Ensure sequences have length >= length\n"
    parser = OptionParser(usage=usage, description=description)

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="only filter sequences with prefix in name",
                      default="")
    parser.add_option("--length", dest="length", type="int",
                      help="filter shorter than length [default=1000]",
                      default=1000)

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        return 1

    inputName = args[0]
    outputName = args[1]

    # The context managers close both files even if processing raises.
    with open(inputName, "r") as inputFile, open(outputName, "w") as outputFile:
        contTable = containedSequences(inputFile)
        # Rewind: containedSequences has consumed the file.
        inputFile.seek(0)

        for header, seq in fastaRead(inputFile):
            # Kept as an explicit comparison: a record is written only when
            # tooShort returns a value equal to False.
            if tooShort(header, seq, options, contTable) == False:
                fastaWrite(outputFile, header, seq)
    return 0
def testFastaReadWrite(self):
    """Tests that sequences written with fastaWrite are read back unchanged.

    For each of self.testNo rounds, writes up to nine random sequences to a
    temp fasta file and checks that fastaRead returns them in the same order.
    """
    tempFile = getTempFile()
    self.tempFiles.append(tempFile)
    for test in range(0, self.testNo):
        fastaNumber = random.choice(range(10))
        l = [getRandomSequence() for i in range(fastaNumber)]
        with open(tempFile, 'w') as fileHandle:
            for name, seq in l:
                fastaWrite(fileHandle, name, seq)

        # Reverse so that pop() hands the expected records back in file order.
        l.reverse()
        # Context managers release the handles even when an assertion fails.
        with open(tempFile, 'r') as fileHandle:
            with io.StringIO() as outFh:
                for i in fastaRead(fileHandle):
                    assert i == l.pop()
                    name, seq = i
                    fastaWrite(outFh, name, seq)
def checkUniqueHeaders(inputFile, checkAlphaNumeric=False, checkUCSC=False, checkAssemblyHub=True):
    """Check that headers are unique and meet certain requirements.

    Headers containing a space or tab are rejected outright. The remaining
    checks look at the first word of the header; with checkUCSC that word is
    reduced to the part after its last '.', and the reduced form is what the
    assembly-hub check and the uniqueness test then see.

    Raises RuntimeError on the first header that fails a check.
    """
    def onlyContains(word, extras):
        # True when every character of word is alphanumeric or listed in extras.
        return all(str.isalnum(char) or char in extras for char in word)

    alreadySeen = set()
    for header, seq in fastaRead(inputFile):
        if " " in header or "\t" in header:
            raise RuntimeError("The fasta header '%s' contains spaces or tabs. These characters will cause issues in space-separated formats like MAF, and may not function properly when viewed in a browser. Please remove these characters from the input headers and try again." % header)
        firstWord = header.split()[0]
        #Check is only alpha numeric
        if checkAlphaNumeric and not onlyContains(firstWord, ()):
            raise RuntimeError("We found a non-alpha numeric character in the fasta header, and the config file (checkAlphaNumeric option) demands that all fasta headers be alpha numeric: %s" % header)
        if checkUCSC:
            firstWord = firstWord.split('.')[-1]
            if not onlyContains(firstWord, ('_', '-', ':')):
                raise RuntimeError("We found a non-alpha numeric, '-', ':' or '_' prefix in the fasta header (UCSC Names option), please modify the first word after the '>' and after the last '.' in every fasta header to only contain alpha-numeric, '_', ':' or '-' characters, or consider using a more lenient option like --checkForAssemblyHub. The offending header: %s" % header)
        if checkAssemblyHub and not onlyContains(firstWord, ('_', '-', ':', '.')):
            raise RuntimeError("An invalid character was found in the first word of a fasta header. Acceptable characters for headers in an assembly hub include alphanumeric characters plus '_', '-', ':', and '.'. Please modify your headers to eliminate other characters. The offending header: %s" % header)
        if firstWord in alreadySeen:
            raise RuntimeError("We found a duplicated fasta header, the first word of each fasta header should be unique within each genome, as this is a requirement for the output HAL file or any MAF file subsequently created. Please modify the input fasta file. Offending duplicate header: %s" % header)
        alreadySeen.add(firstWord)
def getCactusInputs_funkyHeaderNames(regionNumber=0, tempDir=None):
    """Gets Blanchette-region inputs (region 0 by default) whose sequences
    have been given weird header names that might get parsed wrong and cause
    issues.

    Every input fasta is rewritten to a file named after its index inside
    tempDir (a fresh temp dir when tempDir is None); records are appended to
    that file with headers taken cyclically from a fixed list.

    Returns the (sequences, newickTreeString) pair from
    getCactusInputs_blanchette, with each sequence path replaced by the
    rewritten file.
    """
    sequences, newickTreeString = getCactusInputs_blanchette(regionNumber=regionNumber)

    if tempDir is None:
        tempDir = getTempDir()

    # Should also consider "bar foo", "ba rfoo", but we currently
    # throw away everything but the first token (probably because of
    # cigar parsing).
    funkyHeaderNames = ['id=1|foo', 'test1|1600', 'test2|', '|test3', 'id=1|bar']

    # Running count of records written, across all input files, so the header
    # cycle continues from one file to the next.
    recordsWritten = 0
    for pathIndex, originalPath in enumerate(sequences):
        rewrittenPath = os.path.join(tempDir, str(pathIndex))
        for _originalHeader, sequence in fastaRead(originalPath):
            funkyHeader = funkyHeaderNames[recordsWritten % len(funkyHeaderNames)]
            recordsWritten += 1
            fastaWrite(rewrittenPath, funkyHeader, sequence, 'a')
        sequences[pathIndex] = rewrittenPath
    return sequences, newickTreeString
def containedSequences(inputFile):
    """Build a per-name extent table from headers of the form name|1|offset.

    Args:
        inputFile: fasta file (as accepted by fastaRead) to scan.

    Returns:
        None as soon as a header lacks '|1|' or has a non-numeric offset
        after it (this is only expected on the first record, hence the
        asserts). Otherwise a dict mapping name -> (extent, flag) where
        extent is the largest offset + sequence length seen for the name and
        flag becomes True once a record with a different name follows it.
    """
    lookup = dict()
    prev = ""
    for header, seq in fastaRead(inputFile):
        if '|1|' not in header:
            assert len(lookup) == 0
            return None
        else:
            idx = header.find('|1|')
            name = header[:idx]
            offset = header[idx+3:]
            if not offset.isdigit():
                assert len(lookup) == 0
                return None
            # `in` instead of dict.has_key, which Python 3 removed.
            if int(offset) == 0:
                assert name not in lookup
                lookup[name] = (len(seq), False)
            elif name in lookup:
                lookup[name] = (max(lookup[name][0], int(offset) + len(seq)), lookup[name][1])
            if name != prev and prev in lookup:
                lookup[prev] = (lookup[prev][0], True)
            prev = name
    return lookup
def mutateReferenceSequences(referenceFastaFiles):
    """Create mutated copies of each reference fasta at several SNP rates.

    For every input path that does not already contain "percent", and for
    each mutation rate in 1%, 5%, 10% and 20%, a mutated fasta and a
    companion index fasta (original plus mutated sequence per record) are
    written next to the input unless the mutated fasta already exists.

    Args:
        referenceFastaFiles: list of reference fasta paths.

    Returns:
        A new list: the input paths followed by the path of every mutated
        fasta, whether newly written or already present.
    """
    updatedReferenceFastaFiles = referenceFastaFiles[:]
    for referenceFastaFile in referenceFastaFiles:
        if "percent" not in referenceFastaFile:
            mutation_rates = [0.01, 0.05, 0.10, 0.20]
            for mutation_rate in mutation_rates:
                indel_rate = 0.0 * mutation_rate # indel rate is currently fixed at 0
                i = mutation_rate * 100
                j = indel_rate * 100
                # Everything before the first ".fa" plus a suffix that spells
                # out the rates; the two files share that stem.
                newReferenceFastaFile = referenceFastaFile.split(".fa")[0] + "_" + str(i) + "_percent_SNPs_" + str(j) + "_percent_InDels.fasta"
                mutationIndexFile = referenceFastaFile.split(".fa")[0] + "_" + str(i) + "_percent_SNPs_" + str(j) + "_percent_InDels.fasta_Index.txt"
                updatedReferenceFastaFiles.append(newReferenceFastaFile)
                if not os.path.exists(newReferenceFastaFile):
                    # Context managers close both outputs even if mutation
                    # or writing raises.
                    with open(newReferenceFastaFile, 'w') as fH:
                        with open(mutationIndexFile, 'w') as fH2:
                            for header, seq in fastaRead(referenceFastaFile):
                                header = header.split()[0]
                                mutatedSeq = mutateSequence(seq, mutation_rate)
                                fastaWrite(fH, header, mutatedSeq)
                                fastaWrite(fH2, header, seq)
                                fastaWrite(fH2, header + "_mutated", mutatedSeq)
    return updatedReferenceFastaFiles
def containedSequences(inputFile):
    """Build a per-name extent table from headers of the form name|1|offset.

    Returns None as soon as a header lacks '|1|' or carries a non-numeric
    offset after it; the table is asserted to be empty at that point.
    Otherwise returns a dict mapping name -> (extent, flag), where extent is
    the largest offset + sequence length seen for the name and flag turns
    True once a record with a different name follows it.
    """
    extents = {}
    previousName = ""
    for header, seq in fastaRead(inputFile):
        # Split on the first '|1|'; marker is empty when it is absent.
        name, marker, offsetText = header.partition('|1|')
        if not marker or not offsetText.isdigit():
            assert len(extents) == 0
            return None

        offset = int(offsetText)
        if offset == 0:
            assert name not in extents
            extents[name] = (len(seq), False)
        elif name in extents:
            longest, followed = extents[name]
            extents[name] = (max(longest, offset + len(seq)), followed)

        # Moving on to a new name marks the previous one as followed.
        if name != previousName and previousName in extents:
            extents[previousName] = (extents[previousName][0], True)
        previousName = name
    return extents
def lengthWithoutGaps(seq):
    """Return the number of characters in seq that are not the gap '-'."""
    return sum(1 for i in seq if i != '-')

if __name__ == '__main__':
    # Parse args
    if len(sys.argv) < 3:
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(__doc__)
        sys.exit(1)

    newickPath = sys.argv[1]
    fastaPath = sys.argv[2]
    # Only the first line of the newick file is used.
    with open(newickPath) as newickFile:
        treeString = newickFile.read().split("\n")[0].strip()
    tree = NXNewick().parseString(treeString)

    sequences = {}
    with open(fastaPath) as fastaFile:
        for name, seq in fastaRead(fastaFile):
            sequences[name] = seq

    # Print MAF, with sequence lines in post-order.
    print('##maf version=1 scoring=NA')
    print('a tree="%s"' % (treeString))
    for nodeId in tree.postOrderTraversal():
        if not tree.isLeaf(nodeId):
            continue
        nodeName = tree.getName(nodeId)
        if nodeName not in sequences:
            raise RuntimeError("The tree has a node %s which was not found in the fasta file" % (nodeName))
        seq = sequences[nodeName]
        seqLen = lengthWithoutGaps(seq)
        print('s %s 0 %d + %d %s' % (nodeName, seqLen, seqLen, seq))
    # mafValidator wants an empty closing line(?)
# Collect the records of each result set whose position (field 1) lies in one
# of that set's (start, stop] windows.
# items() rather than iteritems() so this runs on Python 2 and Python 3.
for exclude, region in regions.items():
    for start, stop in region:
        raw_recs.extend([x for x in results[exclude] if start < int(x[1]) <= stop])

# region with poor alignment
exclude_regions = [[28574, 31093]]
exclude_regions = [ChromosomeInterval('a', x[0], x[1], '.') for x in exclude_regions]

# Keep only the records whose single-base interval overlaps no excluded region.
recs = []
for r in raw_recs:
    i = ChromosomeInterval('a', int(r[1]), int(r[1]) + 1, '.')
    if not any([i.overlap(x) for x in exclude_regions]):
        recs.append(r)

# build a map of alignment positions to sequence positions
r = {name: seq for name, seq in fastaRead("stitched_alignment.fa")}
r_sort = sorted(r.items(), key=lambda x: x[0])
names, seqs = zip(*r_sort)
# Per sequence: how many non-gap characters have been consumed so far.
tgt_is = {n: 0 for n in names}
pos_map = defaultdict(dict)
for ref_i, cs in enumerate(zip(*seqs)):
    # Record the positions before advancing past this column.
    for name, tgt_i in tgt_is.items():
        pos_map[name][ref_i] = tgt_i
    for name, c in zip(*[names, cs]):
        if c != "-":
            tgt_is[name] += 1
import sys
from sonLib.bioio import fastaRead

# Maps the part of a header before its first '_' to a species name.
speciesMap = {'humanZnfCluster':'human', 'chimpZnfCluster':'chimp',
              'gorillaZnfCluster':'gorilla', 'rhesusZnfCluster.fa':'rhesus',
              'orangZnfCluster':'orang'}

# Print "<header>\t<species>" for every record of the fasta named on the
# command line. A header whose prefix is not in speciesMap raises KeyError.
with open(sys.argv[1]) as fastaFile:
    for header, _ in fastaRead(fastaFile):
        name = header.split("_")[0]
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print("%s\t%s" % (header, speciesMap[name]))
header.start += len( subSequence ) + lenNs sequence = sequence[m.start() + lenNs: ] m = re.search( pattern, sequence ) i = fn2(header, searchedSeq + sequence) if i != None: yield i #=========== MAIN ==================== fH = open(sys.argv[1], 'r') fH2 = open(sys.argv[2], 'w') lengthOfNs = int(sys.argv[3]) lengthOfFragment = int(sys.argv[4]) if len(sys.argv) == 6: setLogLevel(sys.argv[5]) headers = set() for name, sequence in fastaRead(fH): header = Header( name.split()[0], len(sequence) ) logger.info("Got a sequence of length %i with header %s for processing" % (len(sequence), name.split()[0])) for newheader, subsequence in fn( header, sequence, lengthOfNs ): if len( subsequence ) > 0: logger.info("Writing out a sequence of length %i with header %s" % (len(subsequence), newheader)) assert newheader not in headers headers.add(newheader) fastaWrite(fH2, newheader, subsequence) fH.close() fH2.close()
from sonLib.bioio import fastaRead

if "--help" in sys.argv[1:] or len(sys.argv) == 1:
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Script to create a bed file containing the intervals of repeat bases in a fasta file.")
    print("Usage: fastaFile outputBedFile")
    sys.exit(0)

def fn(header):
    """Return the first whitespace-separated word of a fasta header."""
    return header.split()[0]

def fn2(sequence):
    """Yield (start, stop) for every maximal run of repeat bases in sequence.

    A repeat base is one of 'a', 'c', 't', 'g', 'N' or 'n'. Intervals are
    zero-based with an exclusive stop.
    """
    # Named differently from the module-level fn, which the original shadowed.
    isRepeatBase = lambda x : x in [ 'a', 'c', 't', 'g', 'N', 'n']
    i = 0
    while i < len(sequence):
        if isRepeatBase(sequence[i]):
            j = i+1
            while j<len(sequence) and isRepeatBase(sequence[j]):
                j+=1
            yield i, j
            i = j
        else:
            i+=1

# The first argument may hold several whitespace-separated fasta paths.
# Context managers close every file, also when reading or writing fails.
with open(sys.argv[2], 'w') as fileHandle:
    for sequenceFile in sys.argv[1].split():
        with open(sequenceFile, 'r') as fastaHandle:
            for header, sequence in fastaRead(fastaHandle):
                sequenceName = fn(header)
                for start, stop in fn2(sequence):
                    fileHandle.write("%s\t%i\t%i\n" % (sequenceName, start, stop))
import sys
import xml.etree.ElementTree as ET
from sonLib.bioio import fastaRead, fastaWrite

# Usage: <xml file> <input fasta> <output fasta>
# Cuts the single sequence of the input fasta down to the interval given by
# the minOtherReferenceCoordinate / maxOtherReferenceCoordinate attributes of
# the XML root, and rewrites the header to match.
node = ET.parse(sys.argv[1]).getroot()

# Context managers close both files, also when the assert below fails.
with open(sys.argv[3], 'w') as fH:
    with open(sys.argv[2], 'r') as inputFasta:
        seqs = [ i for i in fastaRead(inputFasta) ]
    assert(len(seqs) == 1)
    for name, sequence in seqs:
        #>hg19.chr6.171115067.28377796.5150977.1
        i = name.split(".")
        j = int(node.attrib["minOtherReferenceCoordinate"])
        k = int(node.attrib["maxOtherReferenceCoordinate"])
        # Keep the first three fields and the last one; the fourth field is
        # shifted by j and the fifth is replaced with the new length k - j.
        fastaWrite(fH, ".".join(i[0:3] + [ str(int(i[3]) + j), str(k - j)] + i[-1:]), sequence[j:k])
read_map_holder[para][tag] = find_read_sun_intersections(reads, vcf_recs, bam_handle) from sonLib.bioio import fastaRead from collections import OrderedDict # We need the start positions for each in the actual genome, used a browser BLAT start_pos = {"Notch2": 120087516, "Notch2NL-A": 146248223, "Notch2NL-B": 148698969, "Notch2NL-C": 149374496, "Notch2NL-D": 120707775} # which of these are backwards? backwards = {"Notch2NL-C", "Notch2NL-D"} names = ['Notch2', 'Notch2NL-A', 'Notch2NL-B', 'Notch2NL-C', 'Notch2NL-D'] # same as in VCF # build a map of alignment positions to sequence positions r = {name: seq for name, seq in fastaRead("/hive/users/ifiddes/notch2nl_suns/notch2_aligned.fasta")} r_sort = sorted(r.iteritems(),key=lambda x: x[0]) names, seqs = zip(*r_sort) tgt_is = {n: 0 for n in names} pos_map = defaultdict(dict) for ref_i, cs in enumerate(zip(*seqs)): for name, tgt_i in tgt_is.iteritems(): pos_map[name][ref_i] = tgt_i for name, c in zip(*[names, cs]): if c != "-": tgt_is[name] += 1 # invert pos_map
#!/usr/bin/env python
# Usage: pastaIdsToOriginalNames.py fastaFile renameFile
import sys
from sonLib.bioio import system, fastaRead, fastaWrite

fastaFile = sys.argv[1]
renameFile = sys.argv[2]

# The rename file is read in groups of three lines: a PASTA id, the real
# name, and a third line that is ignored.
curRealName = None
curPastaID = None
translate = {}
with open(renameFile) as renameHandle:
    for i, line in enumerate(renameHandle):
        line = line.strip()
        if i % 3 == 0:
            curPastaID = line
        elif i % 3 == 1:
            curRealName = line
            # Store the pair as soon as it is complete, so the last entry is
            # kept even when the file does not end with a third line.
            translate[curPastaID] = curRealName

# Write the fasta to stdout with every header translated back; a header
# missing from the rename file raises KeyError.
with open(fastaFile) as fastaHandle:
    for header, seq in fastaRead(fastaHandle):
        # hacks for if we are using the badly-named original fasta.
        header = translate[header].replace("...", ".-.").replace(".", "_").replace("__", "_")
        fastaWrite(sys.stdout, header, seq)
return string.translate(s, rc)[::-1] # We need the start positions for each in the actual genome, used a browser BLAT start_pos = {"Notch2": 120087516, "Notch2NL-A": 146248223, "Notch2NL-B": 148698969, "Notch2NL-C": 149374496, "Notch2NL-D": 120707775} # which of these are backwards? backwards = {"Notch2NL-C", "Notch2NL-D"} names = ['Notch2', 'Notch2NL-A', 'Notch2NL-B', 'Notch2NL-C', 'Notch2NL-D'] # same as in VCF header = "##fileformat=VCFv4.1" fields = "\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + names) rec_template = "Notch2\t{pos}\t.\t{ref}\t{alt}\t.\t.\t.\tGT\t{gts}\n" # build a map of alignment positions to sequence positions r = {name: seq for name, seq in fastaRead("notch2_aligned.fasta")} r_sort = sorted(r.iteritems(),key=lambda x: x[0]) names, seqs = zip(*r_sort) tgt_is = {n: 0 for n in names} pos_map = defaultdict(dict) for ref_i, cs in enumerate(zip(*seqs)): for name, tgt_i in tgt_is.iteritems(): pos_map[name][ref_i] = tgt_i for name, c in zip(*[names, cs]): if c != "-": tgt_is[name] += 1 # now lets restructure the vcf to always make Notch2 the reference allele
def lengthWithoutGaps(seq): return len([i for i in seq if i != '-']) if __name__ == '__main__': # Parse args if len(sys.argv) < 3: print __doc__ sys.exit(1) newickPath = sys.argv[1] fastaPath = sys.argv[2] treeString = open(newickPath).read().split("\n")[0].strip() tree = NXNewick().parseString(treeString) sequences = {} for header, seq in fastaRead(open(fastaPath)): fields = header.split('_') name = fields[0] start = fields[1] end = fields[2] strand = fields[3] sequences[header] = (seq, name, start, end, strand) # Print MAF, with sequence lines in post-order. print '##maf version=1 scoring=NA' print 'a tree="%s"' % (treeString) for nodeId in tree.postOrderTraversal(): if not tree.isLeaf(nodeId): continue nodeName = tree.getName(nodeId) if nodeName not in sequences:
#!/usr/bin/env python import sys from sonLib.bioio import fastaRead, fastaWrite # Format: genome => sequence => subsequence, start of subsequence, end # of subsequence genomeMap = {"hg19":{"chr19":("humanZnfCluster", 51927367, 54158296)}, "panTro4":{"chr19":("chimpZnfCluster", 56310088, 58563166)}, "gorGor3":{"chr19":("gorillaZnfCluster", 48765939, 51102984)}, "ponAbe2":{"chr19":("orangZnfCluster", 53063439, 55430961)}, "rheMac3":{"chr19":("rhesusZnfCluster", 57314791, 59488909)}} fasta = sys.argv[1] renameFile = open(sys.argv[2], 'w') for header, seq in fastaRead(open(fasta)): oldHeader = header fields = header.split("_") if len(fields) != 5: # some sequences have _'s in them (chrX_random_N) fields[0] = "_".join(fields[:len(fields)-4]) fields = [field for i, field in enumerate(fields) if i == 0 or i > len(fields) - 5] assert len(fields) == 5 chr = fields[0] start = int(fields[1]) end = int(fields[2]) strand = fields[3] genome = fields[4] if genome in genomeMap: if chr in genomeMap[genome]: subseq = genomeMap[genome][chr][0] subseqStart = genomeMap[genome][chr][1] subseqEnd = genomeMap[genome][chr][2] if start < subseqStart or end >= subseqEnd:
def progressiveFunction(self, experimentFile, toilDir,
                        batchSystem, buildAvgs,
                        buildReference,
                        buildHal,
                        buildFasta,
                        toilStats,
                        subtreeRoot=None):
    """Run a progressive cactus alignment and check the headers in its output.

    Creates a multi-cactus project in a fresh temp directory under the current
    working directory, runs cactus progressive on it, then verifies that every
    sequence header found in the .c2h files for an input (non-outgroup) genome
    also appears in that genome's input fasta. The temp directory is removed
    at the end when no check has failed.

    Args:
        experimentFile: path of the experiment XML file.
        toilDir: toil directory handed to runCactusProgressive and to the
            final status check.
        batchSystem, buildAvgs, toilStats: forwarded to runCactusProgressive.
        buildReference, buildHal, buildFasta: accepted but not used here.
        subtreeRoot: forwarded to runCreateMultiCactusProject as root.
    """
    tempDir = getTempDirectory(os.getcwd())
    tempExperimentDir = os.path.join(tempDir, "exp")
    runCreateMultiCactusProject(experimentFile,
                                tempExperimentDir,
                                fixNames=False,
                                root=subtreeRoot)
    logger.info("Put the temporary files in %s" % tempExperimentDir)

    runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                         toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats)

    # Check that the headers and sequences in the output are the
    # same as the sequences in the input (minus differences in
    # repeat-masking)
    exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqMap = exp.buildSequenceMap()
    # Maps genome name -> set of headers in fasta
    headers = {}
    for genomeName, inputSequencePath in seqMap.items():
        if os.path.isdir(inputSequencePath):
            # Some "input sequence paths" are actually provided as
            # directories containing multiple FASTAs
            concatenatedPath = getTempFile()
            system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
            inputSequencePath = concatenatedPath
        # A set, because it is probed once per sequence line of every c2h
        # file below and is only ever used for membership tests.
        headers[genomeName] = set(map(itemgetter(0), fastaRead(inputSequencePath)))

    # check headers inside .c2h output
    for expPath in glob.glob('%s/*/*_experiment.xml' % (tempExperimentDir)):
        subExp = ExperimentWrapper(ET.parse(expPath).getroot())
        outgroups = subExp.getOutgroupEvents()
        c2hPath = subExp.getHALPath()
        with open(c2hPath) as f:
            for line in f:
                fields = line.split('\t')
                if fields[0] == 's':
                    # Sequence line; [1:-1] drops the first and last
                    # character of the genome and header fields.
                    genome = fields[1][1:-1]
                    header = fields[2][1:-1]
                    if genome in headers and genome not in outgroups:
                        # This genome is an input genome
                        self.assertTrue(header in headers[genome],
                                        'Header %s from output c2h %s not found in input fa %s'
                                        ' for genome %s' % (header, c2hPath, seqMap[genome], genome))

    runToilStatusAndFailIfNotComplete(toilDir)
    system("rm -rf %s" % tempDir)
from sonLib.bioio import fastaRead, fastaWrite
import sys
import random

# Usage: <input fasta> <output fasta>
# Replaces each of the ambiguity codes listed below with a randomly chosen
# base from its set, preserving case.

def fn(k, i, j):
    """Resolve character k if it is the ambiguity code i.

    Returns a random member of j in the same case as k when k equals i
    ignoring case, and k unchanged otherwise.
    """
    if k.upper() == i.upper():
        l = random.choice(j)
        if k == k.upper():
            return l.upper()
        return l.lower()
    else:
        return k

# Context managers close both files, also when reading or writing fails.
with open(sys.argv[2], "w") as fH:
    with open(sys.argv[1], "r") as inputFasta:
        for name, seq in fastaRead(inputFasta):
            # One pass over the sequence per ambiguity code.
            for i, j in [ ("W", ("A", "T")), ("S", ("C", "G")), ("M", ("A", "C")), ("K", ("G", "T")), ("R", ("A", "G")), ("Y", ("C", "T")), ("B", ("C", "G", "T")), ("D", ("A", "G", "T")), ("H", ("A", "C", "T")), ("V", ("A", "C", "G")) ]:
                seq = "".join([ fn(k, i, j) for k in seq ])
            fastaWrite(fH, name, seq)
import sys
import xml.etree.ElementTree as ET
from sonLib.bioio import fastaRead, fastaWrite

# Usage: <xml file> <input fasta> <output fasta>
# Cuts the single sequence of the input fasta down to the interval given by
# the minOtherReferenceCoordinate / maxOtherReferenceCoordinate attributes of
# the XML root, and rewrites the header to match.
node = ET.parse(sys.argv[1]).getroot()

# Context managers close both files, also when the assert below fails.
with open(sys.argv[3], 'w') as fH:
    with open(sys.argv[2], 'r') as inputFasta:
        seqs = [i for i in fastaRead(inputFasta)]
    assert (len(seqs) == 1)
    for name, sequence in seqs:
        #>hg19.chr6.171115067.28377796.5150977.1
        i = name.split(".")
        j = int(node.attrib["minOtherReferenceCoordinate"])
        k = int(node.attrib["maxOtherReferenceCoordinate"])
        # Keep the first three fields and the last one; the fourth field is
        # shifted by j and the fifth is replaced with the new length k - j.
        fastaWrite(fH, ".".join(i[0:3] + [str(int(i[3]) + j), str(k - j)] + i[-1:]), sequence[j:k])
import sys
import xml.etree.ElementTree as ET


def excludedNames(text):
    """Return the set of whitespace-separated names in ``text``.

    ``text`` may be None (an XML element with no text) or empty, in which
    case the result is an empty set.
    """
    if not text:
        return set()
    return set(text.split())


def main(argv):
    """Copy a fasta file, dropping sequences named in an XML file.

    argv[1]: XML file; the root element's text is a whitespace-separated
             list of sequence names to drop.
    argv[2]: input fasta.
    argv[3]: output fasta.

    Returns 1 after printing a usage line when an argument is missing,
    otherwise 0.
    """
    if len(argv) < 4:
        sys.stderr.write(
            "usage: %s <xml file> <fasta input file> <fasta output file>\n"
            % argv[0])
        return 1
    # Imported here so excludedNames stays importable without sonLib.
    from sonLib.bioio import fastaRead, fastaWrite

    excluded = excludedNames(ET.parse(argv[1]).getroot().text)
    with open(argv[2], "r") as inputFile, open(argv[3], "w") as outputFile:
        for name, sequence in fastaRead(inputFile):
            if name not in excluded:
                fastaWrite(outputFile, name, sequence)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
"Notch2NL-C": 149374496,
"Notch2NL-D": 120707775
}  # closes a dict literal whose opening is not part of this excerpt

# which of these are backwards?
backwards = {"Notch2NL-C", "Notch2NL-D"}
names = ['Notch2', 'Notch2NL-A', 'Notch2NL-B', 'Notch2NL-C', 'Notch2NL-D']  # same as in VCF

# Pieces of the VCF text: the file-format line, the tab-separated column
# header (nine fixed columns followed by one column per entry of `names`),
# and a per-record template whose CHROM is fixed to "Notch2" and FORMAT to GT.
header = "##fileformat=VCFv4.1"
fields = "\t".join(
    ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] +
    names)
rec_template = "Notch2\t{pos}\t.\t{ref}\t{alt}\t.\t.\t.\tGT\t{gts}\n"

# build a map of alignment positions to sequence positions
r = {name: seq for name, seq in fastaRead("notch2_aligned.fasta")}
# Sort the records by name, then split them into parallel tuples.  This
# rebinds `names` to the sorted names found in the fasta.
r_sort = sorted(r.iteritems(), key=lambda x: x[0])
names, seqs = zip(*r_sort)
# tgt_is[name]: how many non-gap characters of `name` precede the current
# alignment column.
tgt_is = {n: 0 for n in names}
# pos_map[name][column] -> value of tgt_is[name] when that column is reached.
pos_map = defaultdict(dict)
# zip(*seqs) yields one tuple of characters per alignment column and stops at
# the shortest sequence.
for ref_i, cs in enumerate(zip(*seqs)):
    # Record every sequence's current offset for this column first ...
    for name, tgt_i in tgt_is.iteritems():
        pos_map[name][ref_i] = tgt_i
    # ... then advance the offset of each sequence that is not a gap here.
    for name, c in zip(*[names, cs]):
        if c != "-":
            tgt_is[name] += 1

# Now let's restructure the VCF to always make Notch2 the reference allele,
# as well as turning it into a homozygous call, and removing the depth.
# Finally, we filter for unique calls.
import random
import sys

# Ambiguity codes handled by this script, mapped to the concrete bases each
# may be replaced with.  "N" is not in the table, so it is passed through
# unchanged.
AMBIGUITY_CODES = {
    "W": ("A", "T"),
    "S": ("C", "G"),
    "M": ("A", "C"),
    "K": ("G", "T"),
    "R": ("A", "G"),
    "Y": ("C", "T"),
    "B": ("C", "G", "T"),
    "D": ("A", "G", "T"),
    "H": ("A", "C", "T"),
    "V": ("A", "C", "G"),
}


def fn(k, i, j):
    """Replace character ``k`` if it is the ambiguity code ``i``.

    k: a single sequence character.
    i: the ambiguity code to match (compared case-insensitively).
    j: the bases to choose from at random.

    Returns a random element of ``j``, upper-cased when ``k`` is upper case
    and lower-cased otherwise; returns ``k`` itself when it does not match.
    """
    if k.upper() != i.upper():
        return k
    l = random.choice(j)
    if k == k.upper():
        return l.upper()
    return l.lower()


def resolveAmbiguityCodes(seq):
    """Return ``seq`` with every ambiguity code replaced by a random base.

    A single pass with a table lookup: a replaced character is always one of
    A/C/G/T, which is never itself a code, so this matches running one full
    pass per code.
    """
    resolved = []
    for k in seq:
        code = k.upper()
        choices = AMBIGUITY_CODES.get(code)
        resolved.append(k if choices is None else fn(k, code, choices))
    return "".join(resolved)


def main(argv):
    """Copy fasta ``argv[1]`` to ``argv[2]``, resolving ambiguity codes.

    Returns 1 after printing a usage line when an argument is missing,
    otherwise 0.
    """
    if len(argv) < 3:
        sys.stderr.write("usage: %s <fasta input file> <fasta output file>\n"
                         % argv[0])
        return 1
    # Imported here so the helpers above stay importable without sonLib.
    from sonLib.bioio import fastaRead, fastaWrite
    with open(argv[1], "r") as inputFile, open(argv[2], "w") as outputFile:
        for name, seq in fastaRead(inputFile):
            fastaWrite(outputFile, name, resolveAmbiguityCodes(seq))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
"Notch2": 120087516,
"Notch2NL-A": 146248223,
"Notch2NL-B": 148698969,
"Notch2NL-C": 149374496,
"Notch2NL-D": 120707775
}  # closes a dict literal whose opening is not part of this excerpt

# which of these are backwards?
backwards = {"Notch2NL-C", "Notch2NL-D"}
names = ['Notch2', 'Notch2NL-A', 'Notch2NL-B', 'Notch2NL-C', 'Notch2NL-D']  # same as in VCF

# build a map of alignment positions to sequence positions
# NOTE(review): the alignment is read from a hard-coded absolute path.
r = {
    name: seq
    for name, seq in fastaRead(
        "/hive/users/ifiddes/notch2nl_suns/notch2_aligned.fasta")
}
# Sort the records by name, then split them into parallel tuples.  This
# rebinds `names` to the sorted names found in the fasta.
r_sort = sorted(r.iteritems(), key=lambda x: x[0])
names, seqs = zip(*r_sort)
# tgt_is[name]: how many non-gap characters of `name` precede the current
# alignment column.
tgt_is = {n: 0 for n in names}
# pos_map[name][column] -> value of tgt_is[name] when that column is reached.
pos_map = defaultdict(dict)
# zip(*seqs) yields one tuple of characters per alignment column and stops at
# the shortest sequence.
for ref_i, cs in enumerate(zip(*seqs)):
    # Record every sequence's current offset for this column first ...
    for name, tgt_i in tgt_is.iteritems():
        pos_map[name][ref_i] = tgt_i
    # ... then advance the offset of each sequence that is not a gap here.
    for name, c in zip(*[names, cs]):
        if c != "-":
            tgt_is[name] += 1

# invert pos_map
pos_map_inverted = defaultdict(dict)
def main():
    """Check that the headers of a fasta file are unique and well formed.

    Command line: ``[options] <fasta input file>``.

    For each header the first whitespace-separated word is taken as the key.
    With --checkUCSCNames the key becomes the part of that word after its
    last '.'; the later checks, including uniqueness, then apply to that
    part.

    Returns 1 after printing the help text unless exactly one positional
    argument is given, otherwise 0.

    Raises RuntimeError on the first header that fails an enabled character
    check or whose key has already been seen.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    usage = "usage: %prog [options] <fasta input file>\n\n" + \
            " <fasta file>: fasta sequence to check for unique headers\n"
    description = "Ensure sequence names are unique\n"
    parser = OptionParser(usage=usage, description=description)

    # NOTE(review): this help text says periods and underscores are allowed,
    # but the check below accepts alphanumeric characters only -- confirm
    # which of the two is intended.
    parser.add_option(
        "--checkAlphaNumeric",
        dest="checkAlphaNumeric",
        action="store_true",
        help="Checks that the first word contains only alphanumeric "
        "characters, periods or underscores.",
        default=False)

    parser.add_option(
        "--checkUCSCNames",
        dest="checkUCSC",
        action="store_true",
        help="Checks that suffix of the first word after the last '.' "
        "character contains only alpha-numeric characters or underscores "
        "and is unique. This is useful if exporting to MAF, where sequences "
        "are named 'genome.chr'.",
        default=False)

    parser.add_option("--checkAssemblyHub",
                      dest="checkAssemblyHub",
                      action="store_true",
                      help="Checks that the first word "
                      "of each header is able to be used in a UCSC Assembly "
                      "Hub.",
                      default=False)

    options, args = parser.parse_args()

    if len(args) != 1:
        parser.print_help()
        return 1

    def hasOnlyAllowedChars(word, extraChars=""):
        # True when every character is alphanumeric or one of extraChars.
        return all(str.isalnum(c) or c in extraChars for c in word)

    seen = set()
    # The with-block closes the input even when a check below raises.
    with open(args[0], "r") as inputFile:
        for header, _ in fastaRead(inputFile):
            mungedHeader = header.split()[0]
            if options.checkAlphaNumeric and not hasOnlyAllowedChars(mungedHeader):
                #Check is only alpha numeric
                raise RuntimeError(
                    "We found a non-alpha numeric character in the fasta "
                    "header, and the config file (checkAlphaNumeric option) "
                    "demands that all fasta headers be alpha numeric: %s"
                    % header)
            if options.checkUCSC:
                mungedHeader = mungedHeader.split('.')[-1]
                if not hasOnlyAllowedChars(mungedHeader, "_-:"):
                    # The message names --checkAssemblyHub, the option
                    # actually defined above.
                    raise RuntimeError(
                        "We found a non-alpha numeric, '-', ':' or '_' prefix "
                        "in the fasta header (UCSC Names option), please "
                        "modify the first word after the '>' and after the "
                        "last '.' in every fasta header to only contain "
                        "alpha-numeric, '_', ':' or '-' characters, or "
                        "consider using a more lenient option like "
                        "--checkAssemblyHub. The offending header: %s"
                        % header)
            if options.checkAssemblyHub:
                if not hasOnlyAllowedChars(mungedHeader, "_-:."):
                    raise RuntimeError(
                        "An invalid character was found in the first word of "
                        "a fasta header. Acceptable characters for headers in "
                        "an assembly hub include alphanumeric characters plus "
                        "'_', '-', ':', and '.'. Please modify your headers "
                        "to eliminate other characters. The offending "
                        "header: %s" % header)
            if mungedHeader in seen:
                raise RuntimeError(
                    "We found a duplicated fasta header, the first word of "
                    "each fasta header should be unique within each genome, "
                    "as this is a requirement for the output HAL file or any "
                    "MAF file subsequently created. Please modify the input "
                    "fasta file. Offending duplicate header: %s" % header)
            seen.add(mungedHeader)
    return 0