def main():
    ##########################################
    #Construct the arguments.
    ##########################################    
    
    usage = "usage: %prog [options] <fasta input file> <fasta output file>\n\n" + \
            "    <fasta file>:  fasta sequence to annotate\n"
    description = "Ensure sequence names contain only alphanumeric characters\n" 
    parser = OptionParser(usage=usage, description=description)

    options, args = parser.parse_args()
    
    if len(args) != 2:
        parser.print_help()
        return 1
    
    inputName = args[0]
    inputFile = open(inputName, "r")
    outputName = args[1]
    outputFile = open(outputName, "w")
     
    for header, seq in fastaRead(inputFile):
        fastaWrite(outputFile, fixHeader(header), seq)
            
    outputFile.close()
    inputFile.close()
    return 0
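
Every snippet on this page uses fastaRead and fastaWrite from sonLib.bioio. As a rough orientation, here is a minimal sketch of the interface these examples appear to assume: an iterator of (header, sequence) pairs and a one-record writer. The names fastaReadSketch and fastaWriteSketch are hypothetical stand-ins, not sonLib's real implementation.

# Illustrative sketch only -- not sonLib.bioio's actual code.
def fastaReadSketch(fileHandle):
    """Yield (header, sequence) tuples from an open fasta file handle."""
    header, parts = None, []
    for line in fileHandle:
        line = line.strip()
        if line.startswith(">"):
            if header is not None:
                yield header, "".join(parts)
            header, parts = line[1:], []
        elif line:
            parts.append(line)
    if header is not None:
        yield header, "".join(parts)

def fastaWriteSketch(fileHandle, name, seq):
    """Write a single fasta record to an open handle."""
    fileHandle.write(">%s\n%s\n" % (name, seq))
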
Example No. 2
    def run(self, params="-s 2 -T 0 -Q 0 -a 1"):
        localReferenceFastaFile = os.path.join(
            self.getLocalTempDir(), "ref.fa"
        )  #Because we don't want to have any crufty files created in the local temp dir.
        indexFile = os.path.join(self.getLocalTempDir(),
                                 "my-index")  #Index file
        mafFile = os.path.join(self.getLocalTempDir(), "out.maf")  #MAF file
        #Hack to make LAST work: create @SQ header lines
        fH = open(self.outputSamFile, 'w')
        for name, seq in fastaRead(open(self.referenceFastaFile, 'r')):
            fH.write("@SQ\tSN:%s\tLN:%s\n" % (name.split()[0], len(seq)))
        fH.close()

        #Make a fasta file, as LAST's fastq handling seems broken
        localReadFile = os.path.join(self.getLocalTempDir(),
                                     "reads.fa")  #Read fasta file
        fH = open(localReadFile, 'w')
        for name, seq, quals in fastqRead(self.readFastqFile):
            fastaWrite(fH, name, seq)
        fH.close()

        system("cp %s %s" %
               (self.referenceFastaFile,
                localReferenceFastaFile))  #Copy across the ref file
        system("lastdb %s %s" %
               (indexFile, localReferenceFastaFile))  #Build the index
        system(
            "lastal %s %s %s > %s" %
            (params, indexFile, localReadFile, mafFile))  #Build the alignment
        system("maf-convert.py sam %s >> %s" %
               (mafFile, self.outputSamFile))  #Now convert sam file
Example No. 3
 def getFastaDict(self):
     temp = getTempFile(rootDir=self.getGlobalTempDir())
     system("hal2fasta %s %s > %s" % (self.halPath, self.genome, temp))
     ret = {}
     for header, seq in fastaRead(temp):
         ret[header] = seq
     return ret
def main():
    ##########################################
    #Construct the arguments.
    ##########################################

    usage = "usage: %prog [options] <fasta input file> <fasta output file>\n\n" + \
            "    <fasta file>:  fasta sequence to annotate\n"
    description = "Ensure sequence names contain only alphanumeric characters\n"
    parser = OptionParser(usage=usage, description=description)

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        return 1

    inputName = args[0]
    inputFile = open(inputName, "r")
    outputName = args[1]
    outputFile = open(outputName, "w")

    for header, seq in fastaRead(inputFile):
        fastaWrite(outputFile, fixHeader(header), seq)

    outputFile.close()
    inputFile.close()
    return 0
Example No. 5
    def testFastaReadWriteC(self):
        """Tests consistency with C version of this function.
        """
        tempFile = getTempFile()
        self.tempFiles.append(tempFile)
        tempFile2 = getTempFile()
        self.tempFiles.append(tempFile2)
        for test in range(0, self.testNo):
            fastaNumber = random.choice(range(10))
            l = [getRandomSequence() for i in range(fastaNumber)]
            fileHandle = open(tempFile, 'w')
            for name, seq in l:
                fastaWrite(fileHandle, name, seq)
            fileHandle.close()

            command = "sonLib_fastaCTest %s %s" % (tempFile, tempFile2)

            print(command)

            system(command)

            fileHandle = open(tempFile2, 'r')
            l.reverse()
            outFh = io.StringIO()
            for i in fastaRead(fileHandle):
                name, seq = i
                assert i == l.pop()
                fastaWrite(outFh, name, seq)
            outFh.close()
            fileHandle.close()
Example No. 6
 def getFastaDict(self):
     temp = getTempFile(rootDir=self.getGlobalTempDir())
     system("hal2fasta %s %s > %s" % (self.halPath, self.genome, temp))
     ret = {}
     for header, seq in fastaRead(temp):
         ret[header] = seq
     return ret
Example No. 7
def getSequences(sequenceFile):
    sequences = {}
    fileHandle = open(sequenceFile, "r")
    for header, sequence in fastaRead(fileHandle):
        sequences[header] = sequence
    fileHandle.close()
    return sequences
Example No. 8
def getCactusInputs_funkyHeaderNames(regionNumber=0, tempDir=None):
    """Gets inputs (based on Blanchette region 0) that have weird header names
    that might get parsed wrong and cause issues."""
    sequences, newickTreeString = getCactusInputs_blanchette(
        regionNumber=regionNumber)

    # Assign weird header names
    if tempDir is None:
        tempDir = getTempDir()
    # Should also consider "bar foo", "ba rfoo", but we currently
    # throw away everything but the first token (probably because of
    # cigar parsing).
    funkyHeaderNames = [
        'id=1|foo', 'test1|1600', 'test2|', '|test3', 'id=1|bar'
    ]
    funkyIndex = 0
    for i, sequencePath in enumerate(sequences):
        newPath = os.path.join(tempDir, str(i))
        for _, sequence in fastaRead(sequencePath):
            header = funkyHeaderNames[funkyIndex % len(funkyHeaderNames)]
            funkyIndex += 1
            fastaWrite(newPath, header, sequence, 'a')
        sequences[i] = newPath

    return sequences, newickTreeString
Example No. 9
def getSequences(sequenceFile):
    sequences = {}
    fileHandle = open(sequenceFile, "r")
    for header, sequence in fastaRead(fileHandle):
        sequences[header] = sequence
    fileHandle.close()
    return sequences
Example No. 10
    def run(self, params="-s 2 -T 0 -Q 0 -a 1"):
        localReferenceFastaFile = os.path.join(self.getLocalTempDir(), "ref.fa")
        #Because we don't want to have any crufty files created in the local temp dir.

        indexFile = os.path.join(self.getLocalTempDir(), "my-index") #Index file

        mafFile = os.path.join(self.getLocalTempDir(), "out.maf") #MAF file

        #Hack to make LAST work: create @SQ header lines
        fH = open(self.outputSamFile, 'w')
        for name, seq in fastaRead(open(self.referenceFastaFile, 'r')):
            fH.write("@SQ\tSN:%s\tLN:%s\n" % (name.split()[0], len(seq)))
        fH.close()

        #Make a fasta file, as LAST's fastq handling seems broken
        localReadFile = os.path.join(self.getLocalTempDir(), "reads.fa") #Read fasta file
        fH = open(localReadFile, 'w')
        for name, seq, quals in fastqRead(self.readFastqFile):
            fastaWrite(fH, name, seq)
        fH.close()

        system("cp %s %s" % (self.referenceFastaFile, localReferenceFastaFile)) #Copy across the ref file
        system("lastdb %s %s" % (indexFile, localReferenceFastaFile)) #Build the index
        system("lastal %s %s %s > %s" % (params, indexFile, localReadFile, mafFile)) #Build the alignment
        system("maf-convert.py sam %s >> %s" % (mafFile, self.outputSamFile)) #Now convert sam file
Example No. 11
    def progressiveFunction(self, experimentFile, toilDir,
                            batchSystem, buildAvgs,
                            buildReference,
                            buildHal,
                            buildFasta,
                            toilStats,
                            subtreeRoot=None):
        tempDir = getTempDirectory(os.getcwd())
        tempExperimentDir = os.path.join(tempDir, "exp")
        runCreateMultiCactusProject(experimentFile,
                                    tempExperimentDir,
                                    fixNames=False,
                                    root=subtreeRoot)
        logger.info("Put the temporary files in %s" % tempExperimentDir)

        runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                             toilDir,
                             batchSystem=batchSystem,
                             buildAvgs=buildAvgs,
                             toilStats=toilStats)

        # Check that the headers and sequences in the output are the
        # same as the sequences in the input (minus differences in
        # repeat-masking)
        exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
        seqMap = exp.buildSequenceMap()
        # Maps genome name -> headers in fasta
        headers = {}
        for genomeName, inputSequencePath in seqMap.items():
            if os.path.isdir(inputSequencePath):
                # Some "input sequence paths" are actually provided as
                # directories containing multiple FASTAs
                concatenatedPath = getTempFile()
                system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
                inputSequencePath = concatenatedPath
            headers[genomeName] = list(map(itemgetter(0), fastaRead(inputSequencePath)))

        # check headers inside .c2h output
        for expPath in glob.glob('%s/*/*_experiment.xml' % (tempExperimentDir)):
            subExp = ExperimentWrapper(ET.parse(expPath).getroot())
            outgroups = subExp.getOutgroupEvents()
            c2hPath = subExp.getHALPath()
            with open(c2hPath) as f:
                for line in f:
                    fields = line.split('\t')
                    if fields[0] == 's':
                        # Sequence line
                        genome = fields[1][1:-1]
                        header = fields[2][1:-1]
                        if genome in headers and genome not in outgroups:
                            # This genome is an input genome
                            self.assertTrue(header in headers[genome],
                                            'Header %s from output c2h %s not found in input fa %s'
                                            ' for genome %s' % (header, c2hPath, seqMap[genome], genome))


        runToilStatusAndFailIfNotComplete(toilDir)
        system("rm -rf %s" % tempDir)
 def processSequence(self, eventName, sequencePath):
     fileHandle = open(sequencePath, "r")
     for header, sequence in fastaRead(fileHandle):
         fixedHeader = fixHeader(header, event=eventName.replace(".", "_"))
         print (header, fixedHeader, eventName)
         if header in self.nameMap:
             assert self.nameMap[header] == fixedHeader
         else:
             self.nameMap[header] = fixedHeader
Example No. 13
def getFastasFromSequence(sequenceDirs):
    #Get the sequences
    fastaSeqs = []
    for sequenceDir in sequenceDirs:
        for fastaFile in os.listdir(sequenceDir):
            fileHandle = open(os.path.join(sequenceDir, fastaFile), 'r')
            for name, sequence in fastaRead(fileHandle):
                fastaSeqs.append((name, sequence))
            fileHandle.close()
    return fastaSeqs
Example No. 14
def getFastasFromSequence(sequenceDirs):
    #Get the sequences
    fastaSeqs = []
    for sequenceDir in sequenceDirs:
        for fastaFile in os.listdir(sequenceDir):
            fileHandle = open(os.path.join(sequenceDir, fastaFile), 'r')
            for name, sequence in fastaRead(fileHandle):
                fastaSeqs.append((name, sequence))
            fileHandle.close()
    return fastaSeqs
Example No. 15
 def run(self, args=""):
     tempFastqFile = os.path.join(self.getLocalTempDir(), "temp.fastq")
     normaliseQualValues(self.readFastqFile, tempFastqFile)
     system("lastz %s[multiple] %s %s --format=sam > %s" % (self.referenceFastaFile, tempFastqFile, args, self.outputSamFile))
     try:
         pysam.Samfile(self.outputSamFile, "r" ).close()
     except ValueError:
         #Hack to make lastz work, creating SQ lines when no alignments are found
         fH = open(self.outputSamFile, 'a')
         for name, seq in fastaRead(open(self.referenceFastaFile, 'r')):
             fH.write("@SQ\tSN:%s\tLN:%s\n" % (name.split()[0], len(seq)))
         fH.close()
Example No. 16
def getLowerCaseBases(sequenceFile):
    #Counts lower case bases in fasta sequences
    from sonLib.bioio import fastaRead
    totalMasked = 0
    total = 0
    fileHandle = open(sequenceFile, "r")
    for header, sequence in fastaRead(fileHandle):
        for base in sequence:
            if base != base.upper():
                totalMasked += 1
        total += len(sequence)
    fileHandle.close()
    return total, totalMasked
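
A hypothetical usage of getLowerCaseBases, assuming a soft-masked fasta at the placeholder path masked.fa, to report the masked fraction:

total, totalMasked = getLowerCaseBases("masked.fa")  # "masked.fa" is a placeholder path
if total > 0:
    print("%i of %i bases (%.2f%%) are soft-masked" %
          (totalMasked, total, 100.0 * totalMasked / total))
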
Example No. 17
def getLowerCaseBases(sequenceFile):
    #Counts lower case bases in fasta sequences
    from sonLib.bioio import fastaRead
    totalMasked = 0
    total = 0
    fileHandle = open(sequenceFile, "r")
    for header, sequence in fastaRead(fileHandle):
        for base in sequence:
            if base != base.upper():
                totalMasked += 1
        total += len(sequence)
    fileHandle.close()
    return total, totalMasked
Example No. 18
def build_pos_map():
# build a map of alignment positions to sequence positions
    r = {name: seq for name, seq in fastaRead("/hive/users/ifiddes/notch2nl_suns/notch2_aligned.fasta")}
    r_sort = sorted(r.iteritems(), key=lambda x: x[0])
    names, seqs = zip(*r_sort)
    tgt_is = {n: 0 for n in names}
    pos_map = defaultdict(dict)
    for ref_i, cs in enumerate(zip(*seqs)):
        for name, tgt_i in tgt_is.iteritems():
            pos_map[name][ref_i] = tgt_i
        for name, c in zip(*[names, cs]):
            if c != "-":
                tgt_is[name] += 1
    return pos_map
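
A small self-contained illustration of the column-to-position mapping built above, using a hypothetical in-memory toy alignment instead of the hard-coded fasta path (and dict.items() rather than the Python 2 iteritems()):

from collections import defaultdict

toy_alignment = {"seqA": "AC-GT", "seqB": "A-CGT"}  # hypothetical aligned sequences
names, seqs = zip(*sorted(toy_alignment.items()))
tgt_is = {n: 0 for n in names}
pos_map = defaultdict(dict)
for ref_i, cs in enumerate(zip(*seqs)):
    for name, tgt_i in tgt_is.items():
        pos_map[name][ref_i] = tgt_i  # alignment column -> bases consumed so far
    for name, c in zip(names, cs):
        if c != "-":
            tgt_is[name] += 1
# Column 2 is a gap in seqA, so it maps to the next real base (position 2).
assert pos_map["seqA"] == {0: 0, 1: 1, 2: 2, 3: 2, 4: 3}
assert pos_map["seqB"] == {0: 0, 1: 1, 2: 1, 3: 2, 4: 3}
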
Example No. 19
 def run(self, args=""):
     tempFastqFile = os.path.join(self.getLocalTempDir(), "temp.fastq")
     normaliseQualValues(self.readFastqFile, tempFastqFile)
     system(
         "lastz %s[multiple] %s %s --format=sam > %s" %
         (self.referenceFastaFile, tempFastqFile, args, self.outputSamFile))
     try:
         pysam.Samfile(self.outputSamFile, "r").close()
     except ValueError:
         #Hack to make lastz work, creating SQ lines when no alignments are found
         fH = open(self.outputSamFile, 'a')
         for name, seq in fastaRead(open(self.referenceFastaFile, 'r')):
             fH.write("@SQ\tSN:%s\tLN:%s\n" % (name.split()[0], len(seq)))
         fH.close()
Example No. 20
def main():
    ##########################################
    #Construct the arguments.
    ##########################################    
    
    usage = "usage: %prog [options] <fasta input file>\n\n" + \
            "    <fasta file>:  fasta sequence to check for unique headers\n"
    description = "Ensure sequence names are unique\n" 
    parser = OptionParser(usage=usage, description=description)
    
    parser.add_option("--checkAlphaNumeric", dest="checkAlphaNumeric", action="store_true",
                      help="Checks that the first word contains only alphanumeric characters, periods or underscores.",
                      default=False)

    parser.add_option("--checkUCSCNames", dest="checkUCSC", action="store_true",
                      help="Checks that suffix of the first word after the last '.' character contains only alpha-numeric characters or underscores and is unique. This is useful if exporting to MAF, where sequences are named 'genome.chr'.",
                      default=False)

    parser.add_option("--checkAssemblyHub", dest="checkAssemblyHub",
                      action="store_true", help="Checks that the first word "
                      "of each header is able to be used in a UCSC Assembly "
                      "Hub.")

    options, args = parser.parse_args()
    
    if len(args) != 1:
        parser.print_help()
        return 1
    
    inputName = args[0]
    inputFile = open(inputName, "r")
     
    seen = set()
    for header, seq in fastaRead(inputFile):
        mungedHeader = header.split()[0]
        if options.checkAlphaNumeric and "".join([ i for i in mungedHeader if str.isalnum(i) ]) != mungedHeader: #Check is only alpha numeric
            raise RuntimeError("We found a non-alpha numeric character in the fasta header, and the config file (checkAlphaNumeric option) demands that all fasta headers be alpha numeric: %s" % header)
        if options.checkUCSC:
            mungedHeader = mungedHeader.split('.')[-1]
            if "".join([ i for i in mungedHeader if (str.isalnum(i) or i == '_' or i == '-' or i == ':') ]) != mungedHeader:
                raise RuntimeError("We found a non-alpha numeric, '-', ':' or '_' prefix in the fasta header (UCSC Names option), please modify the first word after the '>' and after the last '.' in every fasta header to only contain alpha-numeric, '_', ':' or '-' characters, or consider using a more lenient option like --checkForAssemblyHub. The offending header: %s" % header)
        if options.checkAssemblyHub:
            if "".join([ i for i in mungedHeader if (str.isalnum(i) or i == '_' or i == '-' or i == ':' or i == ".") ]) != mungedHeader:
                raise RuntimeError("An invalid character was found in the first word of a fasta header. Acceptable characters for headers in an assembly hub include alphanumeric characters plus '_', '-', ':', and '.'. Please modify your headers to eliminate other characters. The offending header: %s" % header)
        if mungedHeader in seen:
            raise RuntimeError("We found a duplicated fasta header, the first word of each fasta header should be unique within each genome, as this is a requirement for the output HAL file or any MAF file subsequently created. Please modify the input fasta file. Offending duplicate header: %s" % header)
        seen.add(mungedHeader)
    inputFile.close()
    return 0
Example No. 21
def build_pos_map():
    # build a map of alignment positions to sequence positions
    r = {
        name: seq
        for name, seq in fastaRead(
            "/hive/users/ifiddes/notch2nl_suns/notch2_aligned.fasta")
    }
    r_sort = sorted(r.iteritems(), key=lambda x: x[0])
    names, seqs = zip(*r_sort)
    tgt_is = {n: 0 for n in names}
    pos_map = defaultdict(dict)
    for ref_i, cs in enumerate(zip(*seqs)):
        for name, tgt_i in tgt_is.iteritems():
            pos_map[name][ref_i] = tgt_i
        for name, c in zip(*[names, cs]):
            if c != "-":
                tgt_is[name] += 1
    return pos_map
def main():
    ##########################################
    #Construct the arguments.
    ##########################################

    usage = "usage: %prog [options] <fasta input file> <fasta output file>\n\n" + \
            "    <fasta file>:  fasta sequence to filter\n"
    description = "Ensure sequences have length >= length\n"

    parser = OptionParser(usage=usage, description=description)

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="only filter sequences with prefix in name",
                      default="")
    parser.add_option("--length",
                      dest="length",
                      type="int",
                      help="filter shorter than length [default=1000]",
                      default=1000)

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        return 1

    inputName = args[0]
    inputFile = open(inputName, "r")
    outputName = args[1]
    outputFile = open(outputName, "w")

    contTable = containedSequences(inputFile)
    inputFile.seek(0)

    for header, seq in fastaRead(inputFile):
        if tooShort(header, seq, options, contTable) == False:
            fastaWrite(outputFile, header, seq)

    outputFile.close()
    inputFile.close()
    return 0
Example No. 23
 def testFastaReadWrite(self):
     tempFile = getTempFile()
     self.tempFiles.append(tempFile)
     for test in range(0, self.testNo):
         fastaNumber = random.choice(range(10))
         l = [getRandomSequence() for i in range(fastaNumber)]
         fileHandle = open(tempFile, 'w')
         for name, seq in l:
             fastaWrite(fileHandle, name, seq)
         fileHandle.close()
         fileHandle = open(tempFile, 'r')
         l.reverse()
         outFh = io.StringIO()
         for i in fastaRead(fileHandle):
             assert i == l.pop()
             name, seq = i
             fastaWrite(outFh, name, seq)
         outFh.close()
         fileHandle.close()
Example No. 24
def checkUniqueHeaders(inputFile,
                       checkAlphaNumeric=False,
                       checkUCSC=False,
                       checkAssemblyHub=True):
    """Check that headers are unique and meet certain requirements."""
    seen = set()
    for header, seq in fastaRead(inputFile):
        if " " in header or "\t" in header:
            raise RuntimeError(
                "The fasta header '%s' contains spaces or tabs. These characters will cause issues in space-separated formats like MAF, and may not function properly when viewed in a browser. Please remove these characters from the input headers and try again."
                % header)
        mungedHeader = header.split()[0]
        if checkAlphaNumeric and "".join([
                i for i in mungedHeader if str.isalnum(i)
        ]) != mungedHeader:  #Check is only alpha numeric
            raise RuntimeError(
                "We found a non-alpha numeric character in the fasta header, and the config file (checkAlphaNumeric option) demands that all fasta headers be alpha numeric: %s"
                % header)
        if checkUCSC:
            mungedHeader = mungedHeader.split('.')[-1]
            if "".join([
                    i for i in mungedHeader
                    if (str.isalnum(i) or i == '_' or i == '-' or i == ':')
            ]) != mungedHeader:
                raise RuntimeError(
                    "We found a non-alpha numeric, '-', ':' or '_' prefix in the fasta header (UCSC Names option), please modify the first word after the '>' and after the last '.' in every fasta header to only contain alpha-numeric, '_', ':' or '-' characters, or consider using a more lenient option like --checkForAssemblyHub. The offending header: %s"
                    % header)
        if checkAssemblyHub:
            if "".join([
                    i for i in mungedHeader if (str.isalnum(i) or i == '_' or i
                                                == '-' or i == ':' or i == ".")
            ]) != mungedHeader:
                raise RuntimeError(
                    "An invalid character was found in the first word of a fasta header. Acceptable characters for headers in an assembly hub include alphanumeric characters plus '_', '-', ':', and '.'. Please modify your headers to eliminate other characters. The offending header: %s"
                    % header)
        if mungedHeader in seen:
            raise RuntimeError(
                "We found a duplicated fasta header, the first word of each fasta header should be unique within each genome, as this is a requirement for the output HAL file or any MAF file subsequently created. Please modify the input fasta file. Offending duplicate header: %s"
                % header)
        seen.add(mungedHeader)
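
A hypothetical invocation of checkUniqueHeaders; genome.fa is a placeholder path, and the function raises RuntimeError on the first failing header:

with open("genome.fa") as inputFile:  # placeholder path
    try:
        checkUniqueHeaders(inputFile, checkAssemblyHub=True)
    except RuntimeError as e:
        print("header check failed: %s" % e)
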
Example No. 25
def getCactusInputs_funkyHeaderNames(regionNumber=0, tempDir=None):
    """Gets inputs (based on Blanchette region 0) that have weird header names
    that might get parsed wrong and cause issues."""
    sequences, newickTreeString = getCactusInputs_blanchette(regionNumber=regionNumber)

    # Assign weird header names
    if tempDir is None:
        tempDir = getTempDir()
    # Should also consider "bar foo", "ba rfoo", but we currently
    # throw away everything but the first token (probably because of
    # cigar parsing).
    funkyHeaderNames = ['id=1|foo', 'test1|1600', 'test2|', '|test3', 'id=1|bar']
    funkyIndex = 0
    for i, sequencePath in enumerate(sequences):
        newPath = os.path.join(tempDir, str(i))
        for _, sequence in fastaRead(sequencePath):
            header = funkyHeaderNames[funkyIndex % len(funkyHeaderNames)]
            funkyIndex += 1
            fastaWrite(newPath, header, sequence, 'a')
        sequences[i] = newPath

    return sequences, newickTreeString
def main():
    ##########################################
    #Construct the arguments.
    ##########################################    
    
    usage = "usage: %prog [options] <fasta input file> <fasta output file>\n\n" + \
            "    <fasta file>:  fasta sequence to filter\n"
    description = "Ensure sequences have length >= length\n"
                    
    parser = OptionParser(usage=usage, description=description)
    
    parser.add_option("--prefix", dest="prefix", type="string",
                      help="only filter sequences with prefix in name",
                      default="")
    parser.add_option("--length", dest="length", type="int",
                      help="filter shorter than length [default=1000]",
                      default=1000)
    
    options, args = parser.parse_args()
    
    if len(args) != 2:
        parser.print_help()
        return 1
    
    inputName = args[0]
    inputFile = open(inputName, "r")
    outputName = args[1]
    outputFile = open(outputName, "w")
  
    contTable = containedSequences(inputFile)
    inputFile.seek(0)
  
    for header, seq in fastaRead(inputFile):
        if tooShort(header, seq, options, contTable) == False:
            fastaWrite(outputFile, header, seq)
      
    outputFile.close()
    inputFile.close()  
    return 0
def containedSequences(inputFile):
    lookup = dict()
    prev = ""
    for header, seq in fastaRead(inputFile):
        if '|1|' not in header:
            assert len(lookup) == 0
            return None
        else:
            idx = header.find('|1|') 
            name = header[:idx]
            offset = header[idx+3:]
            if offset.isdigit() == False:
                assert len(lookup) == 0
                return None
            if int(offset) == 0:
                assert lookup.has_key(name) == False
                lookup[name] = (len(seq), False)
            elif lookup.has_key(name) == True:
                lookup[name] = (max(lookup[name][0], int(offset) + len(seq)), lookup[name][1])
            if name != prev and lookup.has_key(prev):
                lookup[prev] = (lookup[prev][0], True)
            prev = name
    return lookup
Example No. 28
def mutateReferenceSequences(referenceFastaFiles):
    updatedReferenceFastaFiles = referenceFastaFiles[:]
    for referenceFastaFile in referenceFastaFiles:
        if not "percent" in referenceFastaFile:
            mutation_rates = [0.01, 0.05, 0.10, 0.20]
            for mutation_rate in mutation_rates:
                indel_rate = 0.0 * mutation_rate # indel rate (set to 0 here; nominally 20% of the substitution rate)
                i = mutation_rate * 100
                j = indel_rate * 100
                newReferenceFastaFile = referenceFastaFile.split(".fa")[0] + "_" + str(i) + "_percent_SNPs_" + str(j) + "_percent_InDels.fasta"
                mutationIndexFile = referenceFastaFile.split(".fa")[0] + "_" + str(i) + "_percent_SNPs_" + str(j) + "_percent_InDels.fasta_Index.txt"
                updatedReferenceFastaFiles.append(newReferenceFastaFile)
                if not os.path.exists(newReferenceFastaFile):
                    fH = open(newReferenceFastaFile, 'w')
                    fH2 = open(mutationIndexFile, 'w')
                    for header, seq in fastaRead(referenceFastaFile):
                        header = header.split()[0]
                        mutatedSeq = mutateSequence(seq, mutation_rate)
                        fastaWrite(fH, header, mutatedSeq)
                        fastaWrite(fH2, header, seq)
                        fastaWrite(fH2, header + "_mutated", mutatedSeq)
                    fH.close()
                    fH2.close()
    return updatedReferenceFastaFiles
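
mutateSequence is not shown here; a minimal sketch of what such a helper might do, assuming simple per-base substitutions at the given rate (hypothetical name and behaviour, not the original implementation):

import random

def mutateSequenceSketch(seq, mutation_rate, alphabet="ACGT"):
    """Hypothetical stand-in: substitute each base with probability mutation_rate."""
    out = []
    for base in seq:
        if base.upper() in alphabet and random.random() < mutation_rate:
            out.append(random.choice([b for b in alphabet if b != base.upper()]))
        else:
            out.append(base)
    return "".join(out)
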
def containedSequences(inputFile):
    lookup = dict()
    prev = ""
    for header, seq in fastaRead(inputFile):
        if '|1|' not in header:
            assert len(lookup) == 0
            return None
        else:
            idx = header.find('|1|')
            name = header[:idx]
            offset = header[idx + 3:]
            if offset.isdigit() == False:
                assert len(lookup) == 0
                return None
            if int(offset) == 0:
                assert (name in lookup) == False
                lookup[name] = (len(seq), False)
            elif (name in lookup) == True:
                lookup[name] = (max(lookup[name][0],
                                    int(offset) + len(seq)), lookup[name][1])
            if name != prev and prev in lookup:
                lookup[prev] = (lookup[prev][0], True)
            prev = name
    return lookup
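
containedSequences expects headers of the form name|1|offset, where offset is the fragment's start within the full sequence. A small hypothetical check, assuming fastaRead accepts any iterable text handle:

import tempfile

with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa") as fH:
    # two 5 bp fragments of the same sequence at offsets 0 and 5 (hypothetical data)
    fH.write(">chr1|1|0\nACGTA\n>chr1|1|5\nTTTTT\n")
    fH.seek(0)
    lookup = containedSequences(fH)
# chr1 spans at least 10 bases and was not interleaved with another sequence.
assert lookup == {"chr1": (10, False)}
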
def lengthWithoutGaps(seq):
    return len([i for i in seq if i != '-'])

if __name__ == '__main__':
    # Parse args
    if len(sys.argv) < 3:
        print __doc__
        sys.exit(1)

    newickPath = sys.argv[1]
    fastaPath = sys.argv[2]
    treeString = open(newickPath).read().split("\n")[0].strip()
    tree = NXNewick().parseString(treeString)
    
    sequences = {}
    for name, seq in fastaRead(open(fastaPath)):
        sequences[name] = seq
    
    # Print MAF, with sequence lines in post-order.
    print '##maf version=1 scoring=NA'
    print 'a tree="%s"' % (treeString)
    for nodeId in tree.postOrderTraversal():
        if not tree.isLeaf(nodeId):
            continue
        nodeName = tree.getName(nodeId)
        if nodeName not in sequences:
            raise RuntimeError("The tree has a node %s which was not found in the fasta file" % (nodeName))
        seq = sequences[nodeName]
        seqLen = lengthWithoutGaps(seq)
        print 's %s 0 %d + %d %s' % (nodeName, seqLen, seqLen, seq)
    # mafValidator wants an empty closing line(?)
Example No. 31
for exclude, region in regions.iteritems():
    for start, stop in region:
        raw_recs.extend([x for x in results[exclude] if start < int(x[1]) <= stop])


# region with poor alignment
exclude_regions = [[28574, 31093]]
exclude_regions = [ChromosomeInterval('a', x[0], x[1], '.') for x in exclude_regions]
recs = []
for r in raw_recs:
    i = ChromosomeInterval('a', int(r[1]), int(r[1]) + 1, '.')
    if not any([i.overlap(x) for x in exclude_regions]):
        recs.append(r)

# build a map of alignment positions to sequence positions
r = {name: seq for name, seq in fastaRead("stitched_alignment.fa")}
r_sort = sorted(r.iteritems(),key=lambda x: x[0])
names, seqs = zip(*r_sort)
tgt_is = {n: 0 for n in names}


pos_map = defaultdict(dict)

for ref_i, cs in enumerate(zip(*seqs)):
    for name, tgt_i in tgt_is.iteritems():
        pos_map[name][ref_i] = tgt_i
    for name, c in zip(*[names, cs]):
        if c != "-":
            tgt_is[name] += 1

import sys
from sonLib.bioio import fastaRead
speciesMap = {'humanZnfCluster':'human', 'chimpZnfCluster':'chimp',
              'gorillaZnfCluster':'gorilla', 'rhesusZnfCluster.fa':'rhesus',
              'orangZnfCluster':'orang'}

for header, _ in fastaRead(open(sys.argv[1])):
    name = header.split("_")[0]
    print "%s\t%s" % (header, speciesMap[name])
Example No. 33
            header.start += len( subSequence ) + lenNs
        
        sequence = sequence[m.start() + lenNs: ]
        m = re.search( pattern, sequence )
    
    i = fn2(header, searchedSeq + sequence)
    if i != None:
        yield i

#=========== MAIN ====================
fH = open(sys.argv[1], 'r')
fH2 = open(sys.argv[2], 'w')
lengthOfNs = int(sys.argv[3])
lengthOfFragment = int(sys.argv[4])
if len(sys.argv) == 6:
    setLogLevel(sys.argv[5])

headers = set()
for name, sequence in fastaRead(fH):
    header = Header( name.split()[0], len(sequence) )
    logger.info("Got a sequence of length %i with header %s for processing" % (len(sequence), name.split()[0]))
    for newheader, subsequence in fn( header, sequence, lengthOfNs ):
        if len( subsequence ) > 0:
            logger.info("Writing out a sequence of length %i with header %s" % (len(subsequence), newheader))
            assert newheader not in headers
            headers.add(newheader)
            fastaWrite(fH2, newheader, subsequence)
        
fH.close()
fH2.close()
Example No. 34
from sonLib.bioio import fastaRead

if "--help" in sys.argv[1:] or len(sys.argv) == 1:
    print "Script to create a bed file containing the intervals of repeat bases in a  fasta file."
    print "Usage: fastaFile outputBedFile"
    sys.exit(0)

def fn(header):
    return header.split()[0]

def fn2(sequence):
    fn = lambda x : x in [ 'a', 'c', 't', 'g', 'N', 'n']
    i = 0
    while i < len(sequence):
        if fn(sequence[i]):
            j = i+1
            while j<len(sequence) and fn(sequence[j]):
                j+=1
            yield i, j
            i = j
        else:
            i+=1

fileHandle = open(sys.argv[2], 'w')
for sequenceFile in sys.argv[1].split():
    for header, sequence in fastaRead(open(sequenceFile, 'r')):
        sequenceName = fn(header)
        for start, stop, in fn2(sequence):
            fileHandle.write("%s\t%i\t%i\n" % (sequenceName, start, stop))
fileHandle.close()
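
A quick hypothetical spot check of the fn2 run-finder above (with fn2 in scope): lowercase and N bases collapse into half-open intervals.

assert list(fn2("ACgtaNNGT")) == [(2, 7)]
assert list(fn2("ACGT")) == []
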
Example No. 35
import sys
import xml.etree.ElementTree as ET
from sonLib.bioio import fastaRead, fastaWrite
node = ET.parse(sys.argv[1]).getroot()
fH = open(sys.argv[3], 'w')
seqs = [ i for i in fastaRead(open(sys.argv[2], 'r')) ]
assert(len(seqs) == 1)
for name, sequence in seqs:
    #>hg19.chr6.171115067.28377796.5150977.1
    i = name.split(".")
    j = int(node.attrib["minOtherReferenceCoordinate"])
    k = int(node.attrib["maxOtherReferenceCoordinate"])
    fastaWrite(fH, ".".join(i[0:3] + [ str(int(i[3]) + j), str(k - j)] + i[-1:]), sequence[j:k])
fH.close()
        read_map_holder[para][tag] = find_read_sun_intersections(reads, vcf_recs, bam_handle)



from sonLib.bioio import fastaRead
from collections import OrderedDict
# We need the start positions for each in the actual genome, used a browser BLAT
start_pos = {"Notch2": 120087516, "Notch2NL-A": 146248223, "Notch2NL-B": 148698969, "Notch2NL-C": 149374496, "Notch2NL-D": 120707775}
# which of these are backwards?
backwards = {"Notch2NL-C", "Notch2NL-D"}

names = ['Notch2', 'Notch2NL-A', 'Notch2NL-B', 'Notch2NL-C', 'Notch2NL-D'] # same as in VCF


# build a map of alignment positions to sequence positions
r = {name: seq for name, seq in fastaRead("/hive/users/ifiddes/notch2nl_suns/notch2_aligned.fasta")}
r_sort = sorted(r.iteritems(),key=lambda x: x[0])
names, seqs = zip(*r_sort)
tgt_is = {n: 0 for n in names}


pos_map = defaultdict(dict)
for ref_i, cs in enumerate(zip(*seqs)):
    for name, tgt_i in tgt_is.iteritems():
        pos_map[name][ref_i] = tgt_i
    for name, c in zip(*[names, cs]):
        if c != "-":
            tgt_is[name] += 1


# invert pos_map
#!/usr/bin/env python
# Usage: pastaIdsToOriginalNames.py fastaFile renameFile
import sys
from sonLib.bioio import system, fastaRead, fastaWrite

fastaFile = sys.argv[1]
renameFile = sys.argv[2]

curRealName = None
curPastaID = None
translate = {}
for i, line in enumerate(open(renameFile)):
    line = line.strip()
    if i % 3 == 0:
        curPastaID = line
    elif i % 3 == 1:
        curRealName = line
    else:
        translate[curPastaID] = curRealName

for header, seq in fastaRead(open(fastaFile)):
    # hacks for if we are using the badly-named original fasta.
    header = translate[header].replace("...", ".-.").replace(".", "_").replace("__", "_")
    fastaWrite(sys.stdout, header, seq)
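
The rename file is read three lines at a time: a PASTA id, then the original name, then a separator line that is ignored. A hypothetical fragment showing the layout the loop above expects (ids and names are made up):

# Hypothetical rename-file fragment:
#   SEQ0000001
#   hg19_chr19_51927367_54158296_+
#   (ignored separator line)
#   SEQ0000002
#   panTro4_chr19_56310088_58563166_+
#   (ignored separator line)
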
Example No. 38
    return string.translate(s, rc)[::-1]


# We need the start positions for each in the actual genome, used a browser BLAT
start_pos = {"Notch2": 120087516, "Notch2NL-A": 146248223, "Notch2NL-B": 148698969, "Notch2NL-C": 149374496, "Notch2NL-D": 120707775}
# which of these are backwards?
backwards = {"Notch2NL-C", "Notch2NL-D"}

names = ['Notch2', 'Notch2NL-A', 'Notch2NL-B', 'Notch2NL-C', 'Notch2NL-D'] # same as in VCF
header = "##fileformat=VCFv4.1"
fields = "\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + names)
rec_template = "Notch2\t{pos}\t.\t{ref}\t{alt}\t.\t.\t.\tGT\t{gts}\n"


# build a map of alignment positions to sequence positions
r = {name: seq for name, seq in fastaRead("notch2_aligned.fasta")}
r_sort = sorted(r.iteritems(),key=lambda x: x[0])
names, seqs = zip(*r_sort)
tgt_is = {n: 0 for n in names}


pos_map = defaultdict(dict)
for ref_i, cs in enumerate(zip(*seqs)):
    for name, tgt_i in tgt_is.iteritems():
        pos_map[name][ref_i] = tgt_i
    for name, c in zip(*[names, cs]):
        if c != "-":
            tgt_is[name] += 1


# now lets restructure the vcf to always make Notch2 the reference allele
def lengthWithoutGaps(seq):
    return len([i for i in seq if i != '-'])

if __name__ == '__main__':
    # Parse args
    if len(sys.argv) < 3:
        print __doc__
        sys.exit(1)

    newickPath = sys.argv[1]
    fastaPath = sys.argv[2]
    treeString = open(newickPath).read().split("\n")[0].strip()
    tree = NXNewick().parseString(treeString)
    
    sequences = {}
    for header, seq in fastaRead(open(fastaPath)):
        fields = header.split('_')
        name = fields[0]
        start = fields[1]
        end = fields[2]
        strand = fields[3]
        sequences[header] = (seq, name, start, end, strand)
    
    # Print MAF, with sequence lines in post-order.
    print '##maf version=1 scoring=NA'
    print 'a tree="%s"' % (treeString)
    for nodeId in tree.postOrderTraversal():
        if not tree.isLeaf(nodeId):
            continue
        nodeName = tree.getName(nodeId)
        if nodeName not in sequences:
#!/usr/bin/env python
import sys
from sonLib.bioio import fastaRead, fastaWrite
# Format: genome => sequence => subsequence, start of subsequence, end
# of subsequence
genomeMap = {"hg19":{"chr19":("humanZnfCluster", 51927367, 54158296)}, "panTro4":{"chr19":("chimpZnfCluster", 56310088, 58563166)},
             "gorGor3":{"chr19":("gorillaZnfCluster", 48765939, 51102984)}, "ponAbe2":{"chr19":("orangZnfCluster", 53063439, 55430961)},
             "rheMac3":{"chr19":("rhesusZnfCluster", 57314791, 59488909)}}

fasta = sys.argv[1]
renameFile = open(sys.argv[2], 'w')

for header, seq in fastaRead(open(fasta)):
    oldHeader = header
    fields = header.split("_")
    if len(fields) != 5:
        # some sequences have _'s in them (chrX_random_N)
        fields[0] = "_".join(fields[:len(fields)-4])
        fields = [field for i, field in enumerate(fields) if i == 0 or i > len(fields) - 5]
        assert len(fields) == 5
    chr = fields[0]
    start = int(fields[1])
    end = int(fields[2])
    strand = fields[3]
    genome = fields[4]
    if genome in genomeMap:
        if chr in genomeMap[genome]:
            subseq = genomeMap[genome][chr][0]
            subseqStart = genomeMap[genome][chr][1]
            subseqEnd = genomeMap[genome][chr][2]
            if start < subseqStart or end >= subseqEnd:
Example No. 41
    def progressiveFunction(self,
                            experimentFile,
                            toilDir,
                            batchSystem,
                            buildAvgs,
                            buildReference,
                            buildHal,
                            buildFasta,
                            toilStats,
                            subtreeRoot=None):
        tempDir = getTempDirectory(os.getcwd())
        tempExperimentDir = os.path.join(tempDir, "exp")
        runCreateMultiCactusProject(experimentFile,
                                    tempExperimentDir,
                                    fixNames=False,
                                    root=subtreeRoot)
        logger.info("Put the temporary files in %s" % tempExperimentDir)

        runCactusProgressive(os.path.join(tempExperimentDir,
                                          "exp_project.xml"),
                             toilDir,
                             batchSystem=batchSystem,
                             buildAvgs=buildAvgs,
                             toilStats=toilStats)

        # Check that the headers and sequences in the output are the
        # same as the sequences in the input (minus differences in
        # repeat-masking)
        exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
        seqMap = exp.buildSequenceMap()
        # Maps genome name -> headers in fasta
        headers = {}
        for genomeName, inputSequencePath in seqMap.items():
            if os.path.isdir(inputSequencePath):
                # Some "input sequence paths" are actually provided as
                # directories containing multiple FASTAs
                concatenatedPath = getTempFile()
                system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
                inputSequencePath = concatenatedPath
            headers[genomeName] = list(
                map(itemgetter(0), fastaRead(inputSequencePath)))

        # check headers inside .c2h output
        for expPath in glob.glob('%s/*/*_experiment.xml' %
                                 (tempExperimentDir)):
            subExp = ExperimentWrapper(ET.parse(expPath).getroot())
            outgroups = subExp.getOutgroupEvents()
            c2hPath = subExp.getHALPath()
            with open(c2hPath) as f:
                for line in f:
                    fields = line.split('\t')
                    if fields[0] == 's':
                        # Sequence line
                        genome = fields[1][1:-1]
                        header = fields[2][1:-1]
                        if genome in headers and genome not in outgroups:
                            # This genome is an input genome
                            self.assertTrue(
                                header in headers[genome],
                                'Header %s from output c2h %s not found in input fa %s'
                                ' for genome %s' %
                                (header, c2hPath, seqMap[genome], genome))

        runToilStatusAndFailIfNotComplete(toilDir)
        system("rm -rf %s" % tempDir)
Example No. 42
from sonLib.bioio import fastaRead, fastaWrite
import sys
import random
fH = open(sys.argv[2], "w")
def fn(k, i, j):
    if k.upper() == i.upper():
        l = random.choice(j)
        if k == k.upper():
            return l.upper()
        return l.lower()
    else:
        return k
for name, seq in fastaRead(open(sys.argv[1], "r")):
    for i, j in [ ("W", ("A", "T")),
                 ("S", ("C", "G")),
                 ("M", ("A", "C")),
                 ("K", ("G", "T")),
                 ("R", ("A", "G")),
                 ("Y", ("C", "T")),
                 ("B", ("C", "G", "T")),
                 ("D", ("A", "G", "T")),
                 ("H", ("A", "C", "T")),
                 ("V", ("A", "C", "G")) ]:
        seq = "".join([ fn(k, i, j) for k in seq ])
    fastaWrite(fH, name, seq)
fH.close()
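
A hypothetical spot check of fn above: ambiguity codes are resolved to one of their concrete bases with case preserved, and non-matching characters pass through (the choice is random, so only membership is asserted).

assert fn("W", "W", ("A", "T")) in ("A", "T")
assert fn("w", "W", ("A", "T")) in ("a", "t")
assert fn("C", "W", ("A", "T")) == "C"  # non-matching bases are unchanged
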
Example No. 43
import sys
import xml.etree.ElementTree as ET
from sonLib.bioio import fastaRead, fastaWrite
node = ET.parse(sys.argv[1]).getroot()
fH = open(sys.argv[3], 'w')
seqs = [i for i in fastaRead(open(sys.argv[2], 'r'))]
assert (len(seqs) == 1)
for name, sequence in seqs:
    #>hg19.chr6.171115067.28377796.5150977.1
    i = name.split(".")
    j = int(node.attrib["minOtherReferenceCoordinate"])
    k = int(node.attrib["maxOtherReferenceCoordinate"])
    fastaWrite(fH,
               ".".join(i[0:3] +
                        [str(int(i[3]) + j), str(k - j)] + i[-1:]),
               sequence[j:k])
fH.close()
Example No. 44
import sys
import xml.etree.ElementTree as ET
from sonLib.bioio import fastaRead, fastaWrite
i = set([ i for i in ET.parse(sys.argv[1]).getroot().text.split() ])
fH = open(sys.argv[3], 'w')
for name, sequence in fastaRead(open(sys.argv[2], 'r')):
    if name not in i:
        fastaWrite(fH, name, sequence)
fH.close()
Example No. 45
    "Notch2NL-C": 149374496,
    "Notch2NL-D": 120707775
}
# which of these are backwards?
backwards = {"Notch2NL-C", "Notch2NL-D"}

names = ['Notch2', 'Notch2NL-A', 'Notch2NL-B', 'Notch2NL-C',
         'Notch2NL-D']  # same as in VCF
header = "##fileformat=VCFv4.1"
fields = "\t".join(
    ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] +
    names)
rec_template = "Notch2\t{pos}\t.\t{ref}\t{alt}\t.\t.\t.\tGT\t{gts}\n"

# build a map of alignment positions to sequence positions
r = {name: seq for name, seq in fastaRead("notch2_aligned.fasta")}
r_sort = sorted(r.iteritems(), key=lambda x: x[0])
names, seqs = zip(*r_sort)
tgt_is = {n: 0 for n in names}

pos_map = defaultdict(dict)
for ref_i, cs in enumerate(zip(*seqs)):
    for name, tgt_i in tgt_is.iteritems():
        pos_map[name][ref_i] = tgt_i
    for name, c in zip(*[names, cs]):
        if c != "-":
            tgt_is[name] += 1

# now lets restructure the vcf to always make Notch2 the reference allele
# as well as turning it into a homozygous call, and removing the depth.
# finally, we filter for unique calls
Example No. 46
from sonLib.bioio import fastaRead, fastaWrite
import sys
import random
fH = open(sys.argv[2], "w")


def fn(k, i, j):
    if k.upper() == i.upper():
        l = random.choice(j)
        if k == k.upper():
            return l.upper()
        return l.lower()
    else:
        return k


for name, seq in fastaRead(open(sys.argv[1], "r")):
    for i, j in [("W", ("A", "T")), ("S", ("C", "G")), ("M", ("A", "C")),
                 ("K", ("G", "T")), ("R", ("A", "G")), ("Y", ("C", "T")),
                 ("B", ("C", "G", "T")), ("D", ("A", "G", "T")),
                 ("H", ("A", "C", "T")), ("V", ("A", "C", "G"))]:
        seq = "".join([fn(k, i, j) for k in seq])
    fastaWrite(fH, name, seq)
fH.close()
Example No. 47
    "Notch2": 120087516,
    "Notch2NL-A": 146248223,
    "Notch2NL-B": 148698969,
    "Notch2NL-C": 149374496,
    "Notch2NL-D": 120707775
}
# which of these are backwards?
backwards = {"Notch2NL-C", "Notch2NL-D"}

names = ['Notch2', 'Notch2NL-A', 'Notch2NL-B', 'Notch2NL-C',
         'Notch2NL-D']  # same as in VCF

# build a map of alignment positions to sequence positions
r = {
    name: seq
    for name, seq in fastaRead(
        "/hive/users/ifiddes/notch2nl_suns/notch2_aligned.fasta")
}
r_sort = sorted(r.iteritems(), key=lambda x: x[0])
names, seqs = zip(*r_sort)
tgt_is = {n: 0 for n in names}

pos_map = defaultdict(dict)
for ref_i, cs in enumerate(zip(*seqs)):
    for name, tgt_i in tgt_is.iteritems():
        pos_map[name][ref_i] = tgt_i
    for name, c in zip(*[names, cs]):
        if c != "-":
            tgt_is[name] += 1

# invert pos_map
pos_map_inverted = defaultdict(dict)
Example No. 48
def main():
    ##########################################
    #Construct the arguments.
    ##########################################

    usage = "usage: %prog [options] <fasta input file>\n\n" + \
            "    <fasta file>:  fasta sequence to check for unique headers\n"
    description = "Ensure sequence names are unique\n"
    parser = OptionParser(usage=usage, description=description)

    parser.add_option(
        "--checkAlphaNumeric",
        dest="checkAlphaNumeric",
        action="store_true",
        help=
        "Checks that the first word contains only alphanumeric characters, periods or underscores.",
        default=False)

    parser.add_option(
        "--checkUCSCNames",
        dest="checkUCSC",
        action="store_true",
        help=
        "Checks that suffix of the first word after the last '.' character contains only alpha-numeric characters or underscores and is unique. This is useful if exporting to MAF, where sequences are named 'genome.chr'.",
        default=False)

    parser.add_option("--checkAssemblyHub",
                      dest="checkAssemblyHub",
                      action="store_true",
                      help="Checks that the first word "
                      "of each header is able to be used in a UCSC Assembly "
                      "Hub.")

    options, args = parser.parse_args()

    if len(args) != 1:
        parser.print_help()
        return 1

    inputName = args[0]
    inputFile = open(inputName, "r")

    seen = set()
    for header, seq in fastaRead(inputFile):
        mungedHeader = header.split()[0]
        if options.checkAlphaNumeric and "".join([
                i for i in mungedHeader if str.isalnum(i)
        ]) != mungedHeader:  #Check is only alpha numeric
            raise RuntimeError(
                "We found a non-alpha numeric character in the fasta header, and the config file (checkAlphaNumeric option) demands that all fasta headers be alpha numeric: %s"
                % header)
        if options.checkUCSC:
            mungedHeader = mungedHeader.split('.')[-1]
            if "".join([
                    i for i in mungedHeader
                    if (str.isalnum(i) or i == '_' or i == '-' or i == ':')
            ]) != mungedHeader:
                raise RuntimeError(
                    "We found a non-alpha numeric, '-', ':' or '_' prefix in the fasta header (UCSC Names option), please modify the first word after the '>' and after the last '.' in every fasta header to only contain alpha-numeric, '_', ':' or '-' characters, or consider using a more lenient option like --checkForAssemblyHub. The offending header: %s"
                    % header)
        if options.checkAssemblyHub:
            if "".join([
                    i for i in mungedHeader if (str.isalnum(i) or i == '_' or i
                                                == '-' or i == ':' or i == ".")
            ]) != mungedHeader:
                raise RuntimeError(
                    "An invalid character was found in the first word of a fasta header. Acceptable characters for headers in an assembly hub include alphanumeric characters plus '_', '-', ':', and '.'. Please modify your headers to eliminate other characters. The offending header: %s"
                    % header)
        if mungedHeader in seen:
            raise RuntimeError(
                "We found a duplicated fasta header, the first word of each fasta header should be unique within each genome, as this is a requirement for the output HAL file or any MAF file subsequently created. Please modify the input fasta file. Offending duplicate header: %s"
                % header)
        seen.add(mungedHeader)
    inputFile.close()
    return 0