Example #1
0
def main_mergePep(args, stdout, stderr):
    """Merge similar peptide sequences and record the merged hash per row.

    The sequences referenced by the gene table ``args.input`` are handled one
    length at a time (the keys of the length index). For each length the
    gathered sequences are passed to ``pygenes.mergeSequences`` and every
    distinct merged sequence is written to ``args.fasta`` as a FASTA record
    whose name is its ``pygenes.md5hash``. The gene table is then rewritten
    to ``args.output``: for each row, ``mergedPeptideHash`` is set to the
    merged hash found through the row's ``peptideHash``. Only the columns
    named in the header line are written, so the header must already contain
    ``mergedPeptideHash`` for the new value to appear in the output.

    Args:
        args: Namespace providing ``input`` (tab-separated table whose first
            line is the header, optionally prefixed with ``#``), ``fasta``
            (FASTA file to create), ``dissim`` (forwarded to
            ``pygenes.mergeSequences``) and ``output`` (table to create).
        stdout: Unused.
        stderr: Writable stream receiving the progress messages.

    Raises:
        AssertionError: If the same original peptide hash is seen twice.
        KeyError: If a row lacks a header column or its ``peptideHash`` has
            no merged counterpart.
    """
    # Function-scope import so the block does not depend on a file-level one.
    import os

    stderr.write("Building the hash index" + "\n")
    hashIndex = pygenes.buildGeneTableFileIndex(args.input)
    stderr.write("Building the length index" + "\n")
    lengthIndex = pygenes.buildLengthFileIndex(hashIndex)

    # Maps the hash of each original peptide to the hash of its merged form.
    hashToMerged = dict()
    with open(args.fasta, "w") as fo:
        for length in lengthIndex.keys():
            stderr.write("Merging sequences of length " + str(length) + "\n")
            sequences = pygenes.gatherSequences(args.input, lengthIndex,
                                                length)
            # NOTE(review): used below as a mapping original -> merged
            # sequence; confirm against pygenes.mergeSequences.
            mergedSequences = pygenes.mergeSequences(sequences, args.dissim)
            for (original, merged) in mergedSequences.items():
                originalHash = pygenes.md5hash(original)
                # Explicit raise instead of `assert`, so the sanity check is
                # not stripped when Python runs with -O.
                if originalHash in hashToMerged:
                    raise AssertionError("Peptide hash " + str(originalHash) +
                                         " was already merged")
                hashToMerged[originalHash] = pygenes.md5hash(merged)
            # Several originals can share one merged sequence; emit it once.
            for seq in set(mergedSequences.values()):
                fo.write(">" + pygenes.md5hash(seq) + "\n" + seq + "\n")

    # Write next to the final destination: prefixing the whole path with
    # "tmp." would point into a non-existent directory whenever args.output
    # contains a directory component.
    tmpOutput = os.path.join(os.path.dirname(args.output),
                             "tmp." + os.path.basename(args.output))
    with open(args.input, "r") as fi:
        with open(tmpOutput, "w") as fo:
            headers = fi.readline()
            headerElements = headers.lstrip("#").strip().split("\t")
            fo.write(headers)
            for line in fi:
                # Remove only the line terminator: a full strip() would drop
                # leading empty fields and shift every value to the wrong
                # column.
                fields = line.rstrip("\r\n").split("\t")
                content = dict(zip(headerElements, fields))
                content["mergedPeptideHash"] = (
                    hashToMerged[content["peptideHash"]])
                fo.write("\t".join([content[x] for x in headerElements]) +
                         "\n")
    # The temporary file lets args.output be the same path as args.input.
    shutil.move(tmpOutput, args.output)
Example #2
0
def main_splitHclust(args, stdout, stderr):
    """Split poorly conserved alignments into groups of similar sequences.

    ``args.conservation`` is read as tab-separated ``name<TAB>score`` lines
    (blank lines are skipped); every name whose score is below
    ``args.threshold`` is selected for splitting. Each path in ``args.input``
    is then handled according to its base name:

    * Not selected: the file is copied into ``args.outDir`` unless the output
      directory is ``"."``, in which case nothing is done.
    * Selected: its records are parsed as FASTA, the unique sequences are
      merged with ``pygenes.mergeSequences`` and one file
      ``<basename>.split<i>.fa`` is written to ``args.outDir`` per merged
      sequence, holding the original records that were merged into it.
      If ``args.unique`` is set and the file has more unique sequences than
      that, the file is not split.
      When the output directory is ``"."`` and ``args.keep`` is false the
      input file is removed afterwards, whether or not it was split.

    Args:
        args: Namespace providing ``conservation``, ``threshold``, ``outDir``
            (``None`` is replaced by ``"."`` on the namespace itself),
            ``input`` (iterable of FASTA paths), ``unique``, ``dissim`` and
            ``keep``.
        stdout: Unused.
        stderr: Writable stream receiving progress messages.
    """
    # Load conservation file and determine files to realign
    alnToSplit = set()
    with open(args.conservation, "r") as fi:
        for line in fi:
            line = line.strip()
            if line:
                fasta, cons = line.split("\t")
                if float(cons) < args.threshold:
                    alnToSplit.add(fasta)

    # Go through the input files
    if args.outDir is None:
        args.outDir = "."
    inPlace = (args.outDir == ".")
    removeInput = inPlace and not args.keep
    total = str(len(alnToSplit))  # loop invariant, only used for progress
    processedFile = 0
    for fastaFile in args.input:
        baseName = os.path.basename(fastaFile)
        if baseName not in alnToSplit:
            # Untouched files are only duplicated when the output goes to
            # another directory.
            if not inPlace:
                shutil.copy(fastaFile, os.path.join(args.outDir, baseName))
            continue

        processedFile += 1
        stderr.write("Processing file " + baseName +
                     " " + str(processedFile) + "/" + total + "\n")

        # Build mapping from peptide sequences to sequence names; a repeated
        # description keeps the sequence of its last record.
        seqs = dict()
        for record in SeqIO.parse(fastaFile, "fasta"):
            seqs[record.description] = str(record.seq)
        pep2seqNames = collections.defaultdict(list)
        # .items() rather than .iteritems(): works on Python 2 and Python 3.
        for (seqName, pep) in seqs.items():
            pep2seqNames[pep].append(seqName)

        # Produce merged sequences
        uniqueSeqs = list(set(seqs.values()))
        stderr.write("Working with " + str(len(uniqueSeqs)) +
                     " unique sequences\n")
        if (args.unique is not None) and (len(uniqueSeqs) > args.unique):
            stderr.write("Too many unique sequences! File not processed\n")
            if removeInput:
                stderr.write("File deleted\n")
                os.remove(fastaFile)
            continue

        # NOTE(review): used below as a mapping original -> merged sequence;
        # confirm against pygenes.mergeSequences.
        mergedSeqs = pygenes.mergeSequences(uniqueSeqs,
                                            maxDistance=args.dissim,
                                            stderr=stderr)
        # Build mapping from merged sequences to original peptide sequences
        merged2pep = collections.defaultdict(list)
        for (pep, merged) in mergedSeqs.items():
            merged2pep[merged].append(pep)

        # Output: one file per merged sequence
        for (i, originalPeps) in enumerate(merged2pep.values()):
            outFile = os.path.join(args.outDir,
                                   baseName + ".split" + str(i) + ".fa")
            with open(outFile, "w") as fo:
                for originalPep in originalPeps:
                    for seqName in pep2seqNames[originalPep]:
                        fo.write(">" + seqName + "\n")
                        fo.write(originalPep + "\n")
        if removeInput:
            os.remove(fastaFile)