def main_mergePep(args, stdout, stderr):
    """Merge similar peptides of a gene table and record the merged hashes.

    For every key of the length index built from ``args.input``, the
    matching sequences are gathered and merged with
    ``pygenes.mergeSequences``. Each distinct merged sequence is written to
    ``args.fasta`` under its md5 hash. The gene table is then rewritten to
    ``args.output`` with the ``mergedPeptideHash`` value of every row set to
    the merged hash matching the row's ``peptideHash``. Only the columns
    named in the header line of the input table are written back.

    Args:
        args: Parsed command-line namespace. Reads ``input``, ``fasta``,
            ``dissim`` and ``output``.
        stdout: Unused.
        stderr: Writable stream receiving progress messages.

    Raises:
        AssertionError: If the same original peptide hash is produced twice.
        KeyError: If a row's ``peptideHash`` has no merged counterpart, or
            if a header column is missing from a row.
    """
    stderr.write("Building the hash index\n")
    hashIndex = pygenes.buildGeneTableFileIndex(args.input)
    stderr.write("Building the length index\n")
    lengthIndex = pygenes.buildLengthFileIndex(hashIndex)

    # Map: md5 of an original peptide -> md5 of its merged sequence
    hashToMerged = dict()
    with open(args.fasta, "w") as fastaOut:
        for length in lengthIndex.keys():
            stderr.write("Merging sequences of length " + str(length) + "\n")
            sequences = pygenes.gatherSequences(args.input, lengthIndex,
                                                length)
            mergedSequences = pygenes.mergeSequences(sequences, args.dissim)
            for (original, merged) in mergedSequences.items():
                originalHash = pygenes.md5hash(original)
                mergedHash = pygenes.md5hash(merged)
                # Raised explicitly so the check survives `python -O`
                if hashToMerged.get(originalHash, False):
                    raise AssertionError("Peptide hash seen twice: "
                                         + str(originalHash))
                hashToMerged[originalHash] = mergedHash
            # Several originals can share one merged sequence: write it once
            for seq in set(mergedSequences.values()):
                fastaOut.write(">" + pygenes.md5hash(seq) + "\n")
                fastaOut.write(seq + "\n")

    # The table is written to a temporary file first and moved afterwards
    # (presumably so that args.output may be the same path as args.input).
    # Only the basename gets the "tmp." prefix so that an output path with a
    # directory component still points into an existing directory.
    tmpOutput = os.path.join(os.path.dirname(args.output),
                             "tmp." + os.path.basename(args.output))
    with open(args.input, "r") as fi:
        with open(tmpOutput, "w") as fo:
            headers = fi.readline()
            headerElements = headers.lstrip("#").strip().split("\t")
            fo.write(headers)
            for line in fi:
                # Strip only the line terminator: a full strip() would drop
                # empty leading/trailing fields and shift the columns
                fields = line.rstrip("\r\n").split("\t")
                content = dict(zip(headerElements, fields))
                content["mergedPeptideHash"] = (
                    hashToMerged[content["peptideHash"]])
                fo.write("\t".join([content[x] for x in headerElements])
                         + "\n")
    shutil.move(tmpOutput, args.output)
def main_splitHclust(args, stdout, stderr):
    """Split poorly conserved fasta alignments into groups of similar peptides.

    The conservation file (``args.conservation``) holds one
    ``name<TAB>value`` pair per non-blank line; every name whose value is
    below ``args.threshold`` is selected for splitting. Each input file whose
    basename is selected is parsed with ``SeqIO``, its unique peptide
    sequences are merged with ``pygenes.mergeSequences`` and one file
    ``<basename>.split<i>.fa`` is written to the output directory per merged
    sequence, containing the original records mapped to it.

    Args:
        args: Parsed command-line namespace. Reads ``conservation``,
            ``threshold``, ``input``, ``outDir``, ``unique``, ``dissim`` and
            ``keep``. ``outDir`` is set to ``"."`` when it is ``None``.
        stdout: Unused.
        stderr: Writable stream receiving progress messages.
    """
    # Load conservation file and determine files to realign
    alnToSplit = set()
    with open(args.conservation, "r") as fi:
        for line in fi:
            line = line.strip()
            if line != "":
                fasta, cons = line.split("\t")
                if float(cons) < args.threshold:
                    alnToSplit.add(fasta)

    # Go through the input files
    if args.outDir is None:
        args.outDir = "."
    inPlace = args.outDir == "."
    total = str(len(alnToSplit))
    processedFile = 0
    for fastaFile in args.input:
        baseName = os.path.basename(fastaFile)
        if baseName not in alnToSplit:
            # NOTE(review): the source formatting leaves the pairing of this
            # branch ambiguous; it is read as "file not selected for
            # splitting" - confirm against the intended behaviour.
            if not inPlace:
                shutil.copy(fastaFile, os.path.join(args.outDir, baseName))
            continue

        processedFile += 1
        stderr.write("Processing file " + baseName + " " +
                     str(processedFile) + "/" + total + "\n")

        # Build mapping from peptide sequences to sequence names
        seqs = dict()
        for record in SeqIO.parse(fastaFile, "fasta"):
            seqs[record.description] = str(record.seq)
        pep2seqNames = collections.defaultdict(list)
        for (seqName, pep) in seqs.items():
            pep2seqNames[pep].append(seqName)

        # Produce merged sequences
        uniqueSeqs = list(set(seqs.values()))
        stderr.write("Working with " + str(len(uniqueSeqs)) +
                     " unique sequences\n")
        if (args.unique is not None) and (len(uniqueSeqs) > args.unique):
            stderr.write("Too many unique sequences! File not processed\n")
            if inPlace and not args.keep:
                stderr.write("File deleted\n")
                os.remove(fastaFile)
            continue

        mergedSeqs = pygenes.mergeSequences(uniqueSeqs,
                                            maxDistance=args.dissim,
                                            stderr=stderr)

        # Build mapping from merged sequences to original peptide sequences
        merged2pep = collections.defaultdict(list)
        for (pep, merged) in mergedSeqs.items():
            merged2pep[merged].append(pep)

        # Output: one fasta file per merged sequence
        for (i, originalPeps) in enumerate(merged2pep.values()):
            outFile = os.path.join(args.outDir,
                                   baseName + ".split" + str(i) + ".fa")
            with open(outFile, "w") as fo:
                for originalPep in originalPeps:
                    for seqName in pep2seqNames[originalPep]:
                        fo.write(">" + seqName + "\n")
                        fo.write(originalPep + "\n")

        # When working in place, the split files replace the original one
        if inPlace and not args.keep:
            os.remove(fastaFile)