Exemple #1
0
def writeExtensions(genome, extensions):
    """
    Write the translation of every candidate gene extension to "extensions.fas".

    genome:     The genome as a string.
    extensions: A dictionary mapping genes (Iteration objects) to alternative locations where
                that gene could start.

    One fasta record is written per extension.  The header has the form
    ">query~n:start-stop", where n is a counter running over every extension written and
    stop is gene.location[1].  The translated sequence follows, wrapped at 50 characters per line.

    The output file is closed even when translating or writing raises part way through.
    """
    output = open("extensions.fas", "w")
    # try/finally (rather than a with block) keeps to the syntax already used in this file
    # while still guaranteeing that the handle is released on error.
    try:
        q = 0
        for gene, extensionList in extensions.items():
            for extension in extensionList:
                q += 1
                if gene.location[0] < gene.location[1]:
                    # Forward gene: the slice starts at `extension`, the header reports
                    # one more than that (presumably 0-based index -> 1-based coordinate).
                    ext = extension + 1
                    proteins = utils.translate(
                        genome[extension:gene.location[1]])
                else:
                    # Reverse gene: the stretch between the stop and the alternative
                    # start is reverse complemented before translation.
                    ext = extension
                    proteins = utils.translate(
                        utils.reverseComplement(
                            genome[gene.location[1] - 1:extension]))
                output.write(">" + gene.query + "~" + str(q) + ":" +
                             str(ext) + "-" + str(gene.location[1]) + "\n")
                # Slicing clamps at the end of the sequence, so no explicit min() is needed.
                for i in xrange(0, len(proteins), 50):
                    output.write(proteins[i:i + 50] + "\n")
    finally:
        output.close()
Exemple #2
0
def findIntergenics(query, genes, name, minLength, blast, database, eValue, pipeline):
  """
  Search a genome for genes that lie in its intergenic regions.

  query:     File name of the fasta file.
  genes:     A dictionary that maps query names to Iteration objects.
  name:      Name of the genome.
  minLength: Minimum length of any intergenic genes.
  blast:     Location of the installation of blast.
  database:  The database to use with blast.
  eValue:    The e value to use with blast.
  pipeline:  Handed through unchanged to utils.cachedBlast.

  return:    A dictionary that maps query names to Iteration objects, only contains intergenic genes.

  The intergenic regions of the genome are calculated first, and the potential genes found in them (on both
  strands) are written to "intergenics.fas".  That file is blasted and then deleted.  The blast result is passed
  through removeCommonStops, and every remaining gene is flagged as intergenic before being returned.
  """
  genome = utils.loadGenome(query)
  genomeLength = len(genome)
  reverseStrand = utils.reverseComplement(genome)
  forwardRegions, reverseRegions = calculateIntergenicRegions(genomeLength, genes.values(), minLength)

  candidates = findPotentialGenes(genome, forwardRegions, minLength)
  reverseCandidates = findPotentialGenes(reverseStrand, reverseRegions, minLength)
  # Locations found on the reverse complement are mirrored by subtracting them from the genome length.
  candidates += [(genomeLength - found[0], genomeLength - found[1]) for found in reverseCandidates]

  writePotentialGenes(genome, candidates)

  cacheFile = "intergenicBlasts/" + name + ".blastp.xml"
  hits = utils.cachedBlast(cacheFile, blast, database, eValue, "intergenics.fas", pipeline)
  # The fasta file is only an intermediate for the blast call above.
  os.remove("intergenics.fas")

  hits = removeCommonStops(hits)
  for hit in hits.values():
    hit.intergenic = True
    hit.note = "Intergenic"
    hit.color = "160 32 240"
  return hits
Exemple #3
0
def getExtensions(genome, genes):
    """
    Collect, for every gene, the alternative locations at which it could start.

    genome: The genome as a string.
    genes:  A list of Iteration objects.

    return: A dictionary mapping genes (Iteration objects) to alternative locations where that gene could start.

    The search walks away from the gene's current start one codon at a time.  Every start codon met on the way
    contributes one alternate start.  The walk ends at the first stop codon, or right after recording a start
    codon that lies at or beyond the bound taken from the stops returned by getStops (that start is still kept).
    Genes with location[0] < location[1] are scanned towards the beginning of the genome string; all other
    genes are scanned towards its end, using the reverse complement of each codon.
    """
    forwardStops, reverseStops = getStops(genes)
    # The two genome boundaries are added so that they can serve as bounds as well.
    forwardStops.append(1)
    reverseStops.append(len(genome))

    results = {}
    for gene in genes:
        alternates = []
        results[gene] = alternates
        if gene.location[0] < gene.location[1]:
            # Largest known forward stop that lies before this gene's own stop.
            bound = max(stop for stop in forwardStops
                        if stop < gene.location[1])
            for i in xrange(gene.location[0] - 1, 0, -3):
                codon = genome[i - 3:i]
                if codon in utils.startCodons:
                    alternates.append(i - 3)
                    if i <= bound - 1:
                        break
                elif codon in utils.stopCodons:
                    break
        else:
            # Smallest known reverse stop that lies after this gene's own stop.
            bound = min(stop for stop in reverseStops
                        if stop > gene.location[1])
            for i in xrange(gene.location[0], len(genome), 3):
                codon = utils.reverseComplement(genome[i:i + 3])
                if codon in utils.startCodons:
                    alternates.append(i + 3)
                    if i >= bound - 1:
                        break
                elif codon in utils.stopCodons:
                    break
    return results
Exemple #4
0
def getExtensions(genome, genes):
  """
  Compute the alternative start locations of each gene.

  genome: The genome as a string.
  genes:  A list of Iteration objects.

  return: A dictionary mapping genes (Iteration objects) to alternative locations where that gene could start.

  Starting at the original start of the gene, the genome is read codon by codon in the direction away from
  the gene.  Each start codon encountered is recorded as an alternate start.  Reading stops at the first stop
  codon, or immediately after recording a start codon that has passed the bound derived from getStops.
  """
  forwardStops, reverseStops = getStops(genes)
  # Let the genome boundaries act as candidate bounds too.
  forwardStops.append(1)
  reverseStops.append(len(genome))
  genomeLength = len(genome)

  results = {}
  for gene in genes:
    found = []
    results[gene] = found
    if gene.location[0] < gene.location[1]:
      # Forward gene: step three bases at a time towards the beginning of the genome string.
      bound = max([stop for stop in forwardStops if stop < gene.location[1]])
      pos = gene.location[0]-1
      while pos > 0:
        codon = genome[pos-3:pos]
        if codon in utils.startCodons:
          found.append(pos-3)
          if pos <= bound-1:
            break
        elif codon in utils.stopCodons:
          break
        pos -= 3
    else:
      # Reverse gene: step three bases at a time towards the end of the genome string,
      # comparing the reverse complement of each codon.
      bound = min([stop for stop in reverseStops if stop > gene.location[1]])
      pos = gene.location[0]
      while pos < genomeLength:
        codon = utils.reverseComplement(genome[pos:pos+3])
        if codon in utils.startCodons:
          found.append(pos+3)
          if pos >= bound-1:
            break
        elif codon in utils.stopCodons:
          break
        pos += 3
  return results
Exemple #5
0
def findPromoters(query, name, scoreCutoff, frame):
  """
  Predict the promoters of a genome on both strands.

  query:       Name of the query file.
  name:        Name of the genome.
  scoreCutoff: Only promoters whose score is strictly greater than this value are returned.
  frame:       A JFrame that may be used as the parent for a JDialog to display messages.  If it is none then
               messages are just printed.

  return: A list of Promoter objects for the forward and reverse strands.

  Predictions are obtained through cachedBPROM, once for the genome and once for its reverse complement.
  The reverse strand predictions are passed through reverseCoordinates together with the genome length
  before the two lists are combined and filtered by score.
  """
  genome = utils.loadGenome(query)
  genomeLength = len(genome)

  forwardPromoters = cachedBPROM(genome, "promoterPredictions/" + name + ".forward.bprom", frame)
  reversePromoters = cachedBPROM(utils.reverseComplement(genome), "promoterPredictions/" + name + ".reverse.bprom", frame)
  reversePromoters = [reverseCoordinates(genomeLength, promoter) for promoter in reversePromoters]

  return [promoter for promoter in forwardPromoters + reversePromoters if promoter.score > scoreCutoff]
Exemple #6
0
def findIntergenics(query, genes, name, minLength, blast, database, eValue,
                    pipeline):
    """
    Find the intergenic genes of a genome.

    query:     File name of the fasta file.
    genes:     A dictionary that maps query names to Iteration objects.
    name:      Name of the genome.
    minLength: Minimum length of any intergenic genes.
    blast:     Location of the installation of blast.
    database:  The database to use with blast.
    eValue:    The e value to use with blast.
    pipeline:  Forwarded as is to utils.cachedBlast.

    return:    A dictionary that maps query names to Iteration objects, only contains intergenic genes.

    Processing order:
      1. Calculate the intergenic regions for both strands.
      2. Extract the potential genes in those regions and write them to "intergenics.fas".
      3. Blast that file, then delete it.
      4. Pass the blast result through removeCommonStops.
      5. Flag the remaining genes as intergenic and return them.
    """
    genome = utils.loadGenome(query)
    reverseComplementGenome = utils.reverseComplement(genome)
    intergenicRegions = calculateIntergenicRegions(len(genome),
                                                   genes.values(), minLength)
    openForwardLocations, openReverseLocations = intergenicRegions

    def mirror(location):
        # Express a location found on the reverse complement relative to the original genome string.
        return (len(genome) - location[0], len(genome) - location[1])

    potentialGenes = findPotentialGenes(genome, openForwardLocations,
                                        minLength)
    reversePotentialGenes = findPotentialGenes(reverseComplementGenome,
                                               openReverseLocations, minLength)
    potentialGenes += map(mirror, reversePotentialGenes)

    writePotentialGenes(genome, potentialGenes)

    blastResult = utils.cachedBlast(
        "intergenicBlasts/" + name + ".blastp.xml", blast, database, eValue,
        "intergenics.fas", pipeline)
    # The fasta file written above is no longer needed once it has been blasted.
    os.remove("intergenics.fas")

    intergenicGenes = removeCommonStops(blastResult)
    for intergenicGene in intergenicGenes.values():
        intergenicGene.intergenic = True
        intergenicGene.note = "Intergenic"
        intergenicGene.color = "160 32 240"
    return intergenicGenes
Exemple #7
0
def writePotentialGenes(genome, locations):
  """
  Write the translation of every listed gene location to "intergenics.fas".

  genome:    The genome as a string.
  locations: A list of 2-tuples representing the locations of genes in string coordinates (first nucleotide
             is at zero and the ending index is exclusive).

  One fasta record is written per location.  The header has the form ">intergenic~n:start-stop", where n is
  the 1-based position of the location in the list and start/stop are fasta coordinates (first nucleotide is
  at one and the ending index is inclusive).  The translated sequence follows, wrapped at 50 characters per line.
  A location whose first element is not smaller than its second is read from the reverse strand.

  The output file is closed even when translating or writing raises part way through.
  """
  output = open("intergenics.fas", "w")
  # try/finally (rather than a with block) keeps to the syntax already used in this file
  # while still guaranteeing that the handle is released on error.
  try:
    q = 0
    for location in locations:
      q += 1
      if location[0] < location[1]:
        # Forward strand: only the start needs shifting to become a 1-based inclusive coordinate.
        output.write(">intergenic~" + str(q) + ":" + str(location[0]+1) + "-" + str(location[1]) + "\n")
        proteins = utils.translate(genome[location[0]:location[1]])
      else:
        # Reverse strand: the tuple is (high, low), so the shifted value is the second one.
        output.write(">intergenic~" + str(q) + ":" + str(location[0]) + "-" + str(location[1]+1) + "\n")
        proteins = utils.translate(utils.reverseComplement(genome[location[1]:location[0]]))
      # Slicing clamps at the end of the sequence, so no explicit min() is needed.
      for i in xrange(0, len(proteins), 50):
        output.write(proteins[i:i+50] + "\n")
  finally:
    output.close()
Exemple #8
0
def writeExtensions(genome, extensions):
  """
  Write the translation of each possible gene extension to the file "extensions.fas".

  genome:     The genome as a string.
  extensions: A dictionary mapping genes (Iteration objects) to alternative locations where that gene
              could start.

  Every extension produces one fasta record: a header of the form ">query~n:start-stop" (n counts all
  extensions written so far, stop is gene.location[1]) followed by the translated sequence in lines of at
  most 50 characters.

  The file handle is released even if translating or writing raises an exception.
  """
  output = open("extensions.fas", "w")
  # try/finally is used instead of a with block to stay within the syntax this file already relies on.
  try:
    q = 0
    for gene, extensionList in extensions.items():
      for extension in extensionList:
        q += 1
        if gene.location[0] < gene.location[1]:
          # Forward gene: the header start is one more than the slice index
          # (presumably 0-based index -> 1-based coordinate).
          ext = extension+1
          proteins = utils.translate(genome[extension:gene.location[1]])
        else:
          # Reverse gene: translate the reverse complement of the stretch from the stop up to the new start.
          ext = extension
          proteins = utils.translate(utils.reverseComplement(genome[gene.location[1]-1:extension]))
        output.write(">" + gene.query + "~" + str(q) + ":" + str(ext) + "-" + str(gene.location[1]) + "\n")
        # A slice past the end of the sequence is clamped, so min() is not required.
        for i in xrange(0, len(proteins), 50):
          output.write(proteins[i:i+50] + "\n")
  finally:
    output.close()
Exemple #9
0
def writePotentialGenes(genome, locations):
    """
    Write all the genes at the listed locations of the genome to "intergenics.fas".

    genome:    The genome as a string.
    locations: A list of 2-tuples representing the locations of genes in string coordinates
               (first nucleotide is at zero and the ending index is exclusive).

    The written headers have the form ">intergenic~n:start-stop" and contain fasta coordinates
    (first nucleotide is at one and the ending index is inclusive); n is the 1-based position of the
    location in the list.  Each header is followed by the translated sequence in lines of at most
    50 characters.  Locations whose first element is not smaller than the second are taken from the
    reverse strand.

    The file handle is released even if translating or writing raises an exception.
    """
    output = open("intergenics.fas", "w")
    # try/finally is used instead of a with block to stay within the syntax this file already relies on.
    try:
        q = 0
        for location in locations:
            q += 1
            if location[0] < location[1]:
                # Forward strand: shift the start by one to get a 1-based inclusive coordinate.
                header = (">intergenic~" + str(q) + ":" +
                          str(location[0] + 1) + "-" + str(location[1]) + "\n")
                sequence = genome[location[0]:location[1]]
            else:
                # Reverse strand: the tuple is (high, low), so the low end is the one shifted.
                header = (">intergenic~" + str(q) + ":" + str(location[0]) +
                          "-" + str(location[1] + 1) + "\n")
                sequence = utils.reverseComplement(
                    genome[location[1]:location[0]])
            output.write(header)
            proteins = utils.translate(sequence)
            # A slice past the end of the sequence is clamped, so min() is not required.
            for i in xrange(0, len(proteins), 50):
                output.write(proteins[i:i + 50] + "\n")
    finally:
        output.close()