def importPrimerPairs(fastafile):
    """Read primers from a FASTA file and pair them up by name suffix.

    Each primer name is split at its last underscore into a base name and a
    suffix.  Two primers with the same base name form a pair when one suffix
    is a "left" marker (F, f, L, l, 5, left) and the other one a "right"
    marker (R, r, 3, right).

    The bowtie index is read from the module-level ``config``
    (``config['targeting']['bowtieindex']``).

    Args:
        fastafile: value handed to ``MultiFasta``.

    Returns:
        A list of ``[left_primer, right_primer]`` lists.

    Raises:
        Exception: if two primers share a base name but their suffixes are
            not a left/right combination.
    """
    primerfile = MultiFasta(fastafile)
    primers = primerfile.createPrimers(
        config['targeting']['bowtieindex'])  # places in genome
    for primer in primers:
        primer.calcProperties()  # calc Tm and GC

    left_suffix = frozenset(['F', 'f', 'L', 'l', '5', 'left'])
    rite_suffix = frozenset(['R', 'r', '3', 'right'])

    # Split every name once instead of re-splitting inside the quadratic
    # pairing loop.  A name without underscore yields an empty base name and
    # the whole name as suffix (same as the split/join it replaces).
    parsed = []
    for primer in primers:
        base, _, suffix = primer.name.rpartition('_')
        parsed.append((base, suffix))

    # find pairs
    pairs = []
    for i, (baseI, suffixI) in enumerate(parsed):
        # start at i + 1: a primer is never paired with itself and every
        # unordered combination is visited exactly once
        for j in range(i + 1, len(parsed)):
            baseJ, suffixJ = parsed[j]
            if baseI != baseJ:
                continue
            if suffixI in left_suffix and suffixJ in rite_suffix:
                pairs.append([primers[i], primers[j]])
            elif suffixJ in left_suffix and suffixI in rite_suffix:
                pairs.append([primers[j], primers[i]])
            else:
                raise Exception(
                    "Cannot pair primers '%s' and '%s': suffixes '%s' and '%s' "
                    "are not a left/right combination" % (
                        primers[i].name, primers[j].name, suffixI, suffixJ))
    # return valid pairs
    return pairs
def importPrimerPairs(fastafile):
    """Read primers from a FASTA file and pair them up by name suffix.

    Each primer name is split at its last underscore into a base name and a
    suffix.  Two primers with the same base name form a pair when one suffix
    is a "left" marker (F, f, L, l, 5, left) and the other one a "right"
    marker (R, r, 3, right).

    The bowtie index is read from the module-level ``config``
    (``config['targeting']['bowtieindex']``).

    Args:
        fastafile: value handed to ``MultiFasta``.

    Returns:
        A list of ``[left_primer, right_primer]`` lists.

    Raises:
        Exception: if two primers share a base name but their suffixes are
            not a left/right combination.
    """
    primerfile = MultiFasta(fastafile)
    primers = primerfile.createPrimers(config['targeting']['bowtieindex'])  # places in genome
    for primer in primers:
        primer.calcProperties()  # calc Tm and GC

    left_suffix = frozenset(['F', 'f', 'L', 'l', '5', 'left'])
    rite_suffix = frozenset(['R', 'r', '3', 'right'])

    # Split every name once instead of re-splitting inside the quadratic
    # pairing loop.  A name without underscore yields an empty base name and
    # the whole name as suffix (same as the split/join it replaces).
    parsed = []
    for primer in primers:
        base, _, suffix = primer.name.rpartition('_')
        parsed.append((base, suffix))

    # find pairs
    pairs = []
    for i, (baseI, suffixI) in enumerate(parsed):
        # start at i + 1: a primer is never paired with itself and every
        # unordered combination is visited exactly once
        for j in range(i + 1, len(parsed)):
            baseJ, suffixJ = parsed[j]
            if baseI != baseJ:
                continue
            if suffixI in left_suffix and suffixJ in rite_suffix:
                pairs.append([primers[i], primers[j]])
            elif suffixJ in left_suffix and suffixI in rite_suffix:
                pairs.append([primers[j], primers[i]])
            else:
                raise Exception(
                    "Cannot pair primers '%s' and '%s': suffixes '%s' and '%s' "
                    "are not a left/right combination" % (
                        primers[i].name, primers[j].name, suffixI, suffixJ))
    # return valid pairs
    return pairs
def importPrimerPairs(inputfile, config, primer3=True):
    """Import primers from a table or FASTA file and assemble primer pairs.

    If the text after the last '.' in ``inputfile`` starts with 'fa' the file
    is read directly by ``MultiFasta`` and every reference gets the default
    tag ``config['import']['tag']``.  Otherwise the file is parsed as a
    tab-separated table whose header must contain the columns primername,
    primerset, tag, sequence, vessel and well (case-insensitive); the
    sequences are written to ``import_<first 8 chars of fileMD5>.fasta`` in
    the working directory and that file is read by ``MultiFasta``.

    Primers are then placed with ``createPrimers`` and grouped into
    ``PrimerPair`` objects, either by the primer set names from the table or
    by the set name returned by ``parsePrimerName``.  Incomplete sets are
    skipped with a warning.

    Args:
        inputfile: path of the table or FASTA file.
        config: nested mapping; the keys read here are
            ``ordersheet.sequencetags``, ``import.tag``,
            ``import.ampliconsize``, ``design.bowtieindex``,
            ``design.genome``, ``design.mispriming.minimaltm`` and
            ``design.mispriming.identity3prime``.
        primer3: if true, every pair must already have target positions and
            ``pruneRanks()`` is called on it; if false, missing target
            positions are derived from the pair's amplicons.

    Returns:
        List of the primer pairs that could be imported.

    Raises:
        Exception: 'FileHeaderError' when table columns are missing,
            'ImportFormattingError' when a repeated primer name comes with a
            different sequence or tag.
        AssertionError: when a primer set receives a third primer, or (in
            primer3 mode) a pair lacks a target position.
    """
    # read table/fasta
    primersets = defaultdict(list)  # pair primersets
    primertags = {}  # primer tags from table
    if not inputfile.split('.')[-1].startswith('fa'):
        # ignores duplicate sequence
        primerseqs = {}
        fastafile = 'import_' + fileMD5(inputfile)[:8] + '.fasta'
        minimalHeader = set(['primername', 'primerset', 'tag', 'sequence', 'vessel', 'well'])
        with open(fastafile, 'w') as outfh:
            with open(inputfile) as infh:
                for i, line in enumerate(infh):
                    if i == 0:
                        # a real list: the header is reused for every row
                        header = [x.lower() for x in line.rstrip().split('\t')]
                        # explicit check (an assert would vanish under -O)
                        missing = minimalHeader.difference(header)
                        if missing:
                            print >> sys.stderr, 'ERROR: Missing columns (%s)' % ','.join(
                                list(missing))
                            raise Exception('FileHeaderError')
                        continue
                    if not line.strip():
                        continue  # blank line carries no primer
                    f = [x.strip('"') for x in line.rstrip().split('\t')]
                    # rstrip() also removes trailing tabs, i.e. empty trailing
                    # columns; restore them so every header key is present
                    f += [''] * (len(header) - len(f))
                    row = dict(zip(header, f))
                    # remove tag from sequence
                    if row['tag']:
                        try:
                            tagseqs = config['ordersheet']['sequencetags'][row['tag']]['tags']
                        except (LookupError, TypeError):
                            pass  # no sequences configured for this tag
                        else:
                            for t in tagseqs:
                                if row['sequence'].startswith(t):
                                    row['sequence'] = row['sequence'][len(t):]
                                    break
                    # store metadata and write fasta
                    name = row['primername']
                    if name in primerseqs:
                        # repeated primer: must agree with its first occurrence
                        if row['sequence'] != primerseqs[name] or row['tag'] != primertags[name]:
                            print >> sys.stderr, name
                            print >> sys.stderr, primerseqs[name]
                            print >> sys.stderr, primertags[name]
                            raise Exception('ImportFormattingError')
                    else:
                        print >> outfh, '>' + name
                        print >> outfh, row['sequence']
                    # recorded for every row, so a primer listed in several
                    # rows can belong to several primer sets
                    if row['primerset']:
                        primersets[name].append(row['primerset'])
                    primertags[name] = row['tag']
                    primerseqs[name] = row['sequence']
        primerfile = MultiFasta(fastafile)
    else:
        primerfile = MultiFasta(inputfile)
        # set default tags for import
        for r in primerfile.references:
            primertags[r] = config['import']['tag']
    print >> sys.stderr, "Placing primers on genome..."
    # Align primers to genome
    primers = primerfile.createPrimers(
        config['design']['bowtieindex'],
        delete=False,
        tags=primertags,
        tmThreshold=config['design']['mispriming']['minimaltm'],
        endMatch=config['design']['mispriming']['identity3prime'])  # places in genome
    # pair primers (by name or by primerset) MAKE COPIES!!!!
    pairs = {}
    for p in primers:
        # .get() does not create an entry in the defaultdict
        setnames = primersets.get(p.name) or [parsePrimerName(p.name)[0]]
        for setname in setnames:
            if setname not in pairs:
                try:
                    pairs[setname] = PrimerPair([None, None], name=setname)
                except Exception:
                    print >> sys.stderr, '>>', primersets[p.name], '|', p.name, '|', setnames, '<'
                    raise
            # get primer orientation (might be wrong if guesses from name, will correct after)
            ## this basically just makes sure primers get paired (one fwd, one reverse)
            reverse = p.targetposition.reverse if p.targetposition else (
                parsePrimerName(p.name)[1] < 0)
            try:
                pair = pairs[setname]
                if reverse and pair[1] is None:
                    pair[1] = deepcopy(p)
                elif pair[0] is None:
                    pair[0] = deepcopy(p)
                else:
                    # explicit raise instead of assert so the check survives -O
                    if pair[1] is not None:
                        raise AssertionError(
                            'primer set %s already has two primers' % setname)
                    pair[1] = deepcopy(p)
            except Exception:
                print >> sys.stderr, "ERROR: Primer pair strand conflict?"
                print >> sys.stderr, "PRIMER0", pairs[setname][0]
                print >> sys.stderr, "PRIMER1", pairs[setname][1]
                print >> sys.stderr, "REVERSE", reverse
                print >> sys.stderr, "SETNAME", setname
                print >> sys.stderr, "PRIMER", p.name, parsePrimerName(p.name)
                print >> sys.stderr, "PAIRS", pairs[setname]
                raise
    # check if any unpaired primers (iterate a snapshot: entries get deleted)
    for k, v in list(pairs.items()):
        if not all(v):
            print >> sys.stderr, "WARNING: primer set %s is incomplete and skipped" % k
            del pairs[k]
    # prune ranks in primer3 mode (renames pair)
    if primer3:
        for p in pairs.values():
            # make sure target positions are set
            if not (p[0].targetposition and p[1].targetposition):
                raise AssertionError('primer pair without target position')
            p.pruneRanks()
        validPairs = list(pairs.values())
    else:
        # guess target if not set
        validPairs = []
        print >> sys.stderr, 'Identifying correct amplicons for unplaced primer pairs...'
        for p in pairs.values():
            if p[0].targetposition and p[1].targetposition:
                validPairs.append(p)
                continue
            amplicons = p.amplicons(config['import']['ampliconsize'], autoreverse=True)
            if amplicons:
                # first of the shortest amplicons (by length of element 2)
                shortest = min(amplicons, key=lambda x: len(x[2]))
                if len(amplicons) > 1:
                    print >> sys.stderr, 'WARNING: multiple amplicons for {}. Assuming shortest ({}bp) is correct.'.format(
                        p.name, str(len(shortest[2])))
                p[0].targetposition = shortest[0]  # m
                p[1].targetposition = shortest[1]  # n
                validPairs.append(p)
                continue
            # try to find amplicon by sequence matching if no amplicons from genome mapping with sufficient Tm
            # (this path is only reached with primer3 false, so the former
            # "elif not primer3" test and its else branch were dead code)
            refGenome = Genome(config['design']['genome'])
            # get new loci (one round)
            newLoci = [[], []]
            for mapped, query in [[0, 1], [1, 0]]:
                for locus in p[mapped].loci:
                    newLoci[query] += refGenome.primerMatch(
                        locus, p[query].seq, config['import']['ampliconsize'])
            if not newLoci[0] and not newLoci[1]:
                print >> sys.stderr, 'WARNING: {} is not specific and not imported ({},{})'.format(
                    p.name, len(p[0].loci), len(p[1].loci))
                continue
            # add new loci
            for i, loc in enumerate(newLoci):
                p[i].loci += loc
                p[i].loci = list(set(p[i].loci))  # remove redundancy
            # store new amplicon
            amplicons = p.amplicons(config['import']['ampliconsize'], autoreverse=True)
            if amplicons:
                p[0].targetposition = amplicons[0][0]  # m
                p[1].targetposition = amplicons[0][1]  # n
                validPairs.append(p)
            else:
                print >> sys.stderr, '\n'.join(["-> " + str(l) for l in p[0].loci])
                print >> sys.stderr, '\n'.join(["<- " + str(l) for l in p[1].loci])
                print >> sys.stderr, 'WARNING: Primer set {} has no valid amplicons ({},{})'.format(
                    p.name, len(p[0].loci), len(p[1].loci))
    return validPairs
def importPrimerPairs(inputfile, config, primer3=True):
    """Import primers from a table or FASTA file and assemble primer pairs.

    If the text after the last "." in ``inputfile`` starts with "fa" the file
    is read directly by ``MultiFasta`` and every reference gets the default
    tag ``config["import"]["tag"]``.  Otherwise the file is parsed as a
    tab-separated table whose header must contain the columns primername,
    primerset, tag, sequence, vessel and well (case-insensitive); the
    sequences are written to ``import_<first 8 chars of fileMD5>.fasta`` in
    the working directory and that file is read by ``MultiFasta``.

    Primers are then placed with ``createPrimers`` and grouped into
    ``PrimerPair`` objects, either by the primer set names from the table or
    by the set name returned by ``parsePrimerName``.  Incomplete sets are
    skipped with a warning.

    Args:
        inputfile: path of the table or FASTA file.
        config: nested mapping; the keys read here are
            ``ordersheet.sequencetags``, ``import.tag``,
            ``import.ampliconsize``, ``design.bowtieindex``,
            ``design.genome``, ``design.mispriming.minimaltm`` and
            ``design.mispriming.identity3prime``.
        primer3: if true, every pair must already have target positions and
            ``pruneRanks()`` is called on it; if false, missing target
            positions are derived from the pair's amplicons.

    Returns:
        List of the primer pairs that could be imported.

    Raises:
        Exception: "FileHeaderError" when table columns are missing,
            "ImportFormattingError" when a repeated primer name comes with a
            different sequence or tag.
        AssertionError: when a primer set receives a third primer, or (in
            primer3 mode) a pair lacks a target position.
    """
    # read table/fasta
    primersets = defaultdict(list)  # pair primersets
    primertags = {}  # primer tags from table
    if not inputfile.split(".")[-1].startswith("fa"):
        # ignores duplicate sequence
        primerseqs = {}
        fastafile = "import_" + fileMD5(inputfile)[:8] + ".fasta"
        minimalHeader = set(["primername", "primerset", "tag", "sequence", "vessel", "well"])
        with open(fastafile, "w") as outfh:
            with open(inputfile) as infh:
                for i, line in enumerate(infh):
                    if i == 0:
                        # a real list: the header is reused for every row
                        header = [x.lower() for x in line.rstrip().split("\t")]
                        # explicit check (an assert would vanish under -O)
                        missing = minimalHeader.difference(header)
                        if missing:
                            print >> sys.stderr, "ERROR: Missing columns (%s)" % ",".join(
                                list(missing)
                            )
                            raise Exception("FileHeaderError")
                        continue
                    if not line.strip():
                        continue  # blank line carries no primer
                    f = [x.strip('"') for x in line.rstrip().split("\t")]
                    # rstrip() also removes trailing tabs, i.e. empty trailing
                    # columns; restore them so every header key is present
                    f += [""] * (len(header) - len(f))
                    row = dict(zip(header, f))
                    # remove tag from sequence
                    if row["tag"]:
                        try:
                            tagseqs = config["ordersheet"]["sequencetags"][row["tag"]]["tags"]
                        except (LookupError, TypeError):
                            pass  # no sequences configured for this tag
                        else:
                            for t in tagseqs:
                                if row["sequence"].startswith(t):
                                    row["sequence"] = row["sequence"][len(t):]
                                    break
                    # store metadata and write fasta
                    name = row["primername"]
                    if name in primerseqs:
                        # repeated primer: must agree with its first occurrence
                        if row["sequence"] != primerseqs[name] or row["tag"] != primertags[name]:
                            print >> sys.stderr, name
                            print >> sys.stderr, primerseqs[name]
                            print >> sys.stderr, primertags[name]
                            raise Exception("ImportFormattingError")
                    else:
                        print >> outfh, ">" + name
                        print >> outfh, row["sequence"]
                    # recorded for every row, so a primer listed in several
                    # rows can belong to several primer sets
                    if row["primerset"]:
                        primersets[name].append(row["primerset"])
                    primertags[name] = row["tag"]
                    primerseqs[name] = row["sequence"]
        primerfile = MultiFasta(fastafile)
    else:
        primerfile = MultiFasta(inputfile)
        # set default tags for import
        for r in primerfile.references:
            primertags[r] = config["import"]["tag"]
    print >> sys.stderr, "Placing primers on genome..."
    # Align primers to genome
    primers = primerfile.createPrimers(
        config["design"]["bowtieindex"],
        delete=False,
        tags=primertags,
        tmThreshold=config["design"]["mispriming"]["minimaltm"],
        endMatch=config["design"]["mispriming"]["identity3prime"],
    )  # places in genome
    # pair primers (by name or by primerset) MAKE COPIES!!!!
    pairs = {}
    for p in primers:
        # .get() does not create an entry in the defaultdict
        setnames = primersets.get(p.name) or [parsePrimerName(p.name)[0]]
        for setname in setnames:
            if setname not in pairs:
                try:
                    pairs[setname] = PrimerPair([None, None], name=setname)
                except Exception:
                    print >> sys.stderr, ">>", primersets[p.name], "|", p.name, "|", setnames, "<"
                    raise
            # get primer orientation (might be wrong if guesses from name, will correct after)
            ## this basically just makes sure primers get paired (one fwd, one reverse)
            reverse = p.targetposition.reverse if p.targetposition else (
                parsePrimerName(p.name)[1] < 0
            )
            try:
                pair = pairs[setname]
                if reverse and pair[1] is None:
                    pair[1] = deepcopy(p)
                elif pair[0] is None:
                    pair[0] = deepcopy(p)
                else:
                    # explicit raise instead of assert so the check survives -O
                    if pair[1] is not None:
                        raise AssertionError(
                            "primer set %s already has two primers" % setname
                        )
                    pair[1] = deepcopy(p)
            except Exception:
                print >> sys.stderr, "ERROR: Primer pair strand conflict?"
                print >> sys.stderr, "PRIMER0", pairs[setname][0]
                print >> sys.stderr, "PRIMER1", pairs[setname][1]
                print >> sys.stderr, "REVERSE", reverse
                print >> sys.stderr, "SETNAME", setname
                print >> sys.stderr, "PRIMER", p.name, parsePrimerName(p.name)
                print >> sys.stderr, "PAIRS", pairs[setname]
                raise
    # check if any unpaired primers (iterate a snapshot: entries get deleted)
    for k, v in list(pairs.items()):
        if not all(v):
            print >> sys.stderr, "WARNING: primer set %s is incomplete and skipped" % k
            del pairs[k]
    # prune ranks in primer3 mode (renames pair)
    if primer3:
        for p in pairs.values():
            # make sure target positions are set
            if not (p[0].targetposition and p[1].targetposition):
                raise AssertionError("primer pair without target position")
            p.pruneRanks()
        validPairs = list(pairs.values())
    else:
        # guess target if not set
        validPairs = []
        print >> sys.stderr, "Identifying correct amplicons for unplaced primer pairs..."
        for p in pairs.values():
            if p[0].targetposition and p[1].targetposition:
                validPairs.append(p)
                continue
            amplicons = p.amplicons(config["import"]["ampliconsize"], autoreverse=True)
            if amplicons:
                # first of the shortest amplicons (by length of element 2)
                shortest = min(amplicons, key=lambda x: len(x[2]))
                if len(amplicons) > 1:
                    print >> sys.stderr, "WARNING: multiple amplicons for {}. Assuming shortest ({}bp) is correct.".format(
                        p.name, str(len(shortest[2]))
                    )
                p[0].targetposition = shortest[0]  # m
                p[1].targetposition = shortest[1]  # n
                validPairs.append(p)
                continue
            # try to find amplicon by sequence matching if no amplicons from genome mapping with sufficient Tm
            # (this path is only reached with primer3 false, so the former
            # "elif not primer3" test and its else branch were dead code)
            refGenome = Genome(config["design"]["genome"])
            # get new loci (one round)
            newLoci = [[], []]
            for mapped, query in [[0, 1], [1, 0]]:
                for locus in p[mapped].loci:
                    newLoci[query] += refGenome.primerMatch(
                        locus, p[query].seq, config["import"]["ampliconsize"]
                    )
            if not newLoci[0] and not newLoci[1]:
                print >> sys.stderr, "WARNING: {} is not specific and not imported ({},{})".format(
                    p.name, len(p[0].loci), len(p[1].loci)
                )
                continue
            # add new loci
            for i, loc in enumerate(newLoci):
                p[i].loci += loc
                p[i].loci = list(set(p[i].loci))  # remove redundancy
            # store new amplicon
            amplicons = p.amplicons(config["import"]["ampliconsize"], autoreverse=True)
            if amplicons:
                p[0].targetposition = amplicons[0][0]  # m
                p[1].targetposition = amplicons[0][1]  # n
                validPairs.append(p)
            else:
                print >> sys.stderr, "\n".join(["-> " + str(l) for l in p[0].loci])
                print >> sys.stderr, "\n".join(["<- " + str(l) for l in p[1].loci])
                print >> sys.stderr, "WARNING: Primer set {} has no valid amplicons ({},{})".format(
                    p.name, len(p[0].loci), len(p[1].loci)
                )
    return validPairs