Exemple #1
0
def ReadReferenceFromFile(File):
  '''Read the reference file and check that it holds exactly one sequence.

  Args:
    File: handed unchanged to ReadSequencesFromFile as the file to read.

  Returns:
    The pair (AllSequences.items(), ReferenceLength), where AllSequences and
    ReferenceLength are the two values returned by ReadSequencesFromFile.

  Exits with status 1, after printing a message to stderr, if the number of
  sequences found is not exactly one.
  '''
  # The second argument is False; elsewhere in this file the same position is
  # passed a variable named Aligned, so this presumably means "do not require
  # the sequences to be aligned" - confirm against ReadSequencesFromFile.
  AllSequences, ReferenceLength = ReadSequencesFromFile(File, False)
  NumSequencesFound = len(AllSequences)
  if NumSequencesFound != 1:
    # Report the file that was actually read (the File argument) rather than
    # relying on an outer variable that may not exist in the caller's module.
    print('Found', NumSequencesFound, 'sequences in', str(File) +
    '; expected 1.\nQuitting.', file=sys.stderr)
    # sys.exit does not depend on the site module having injected exit().
    sys.exit(1)
  return AllSequences.items(), ReferenceLength
# Top-level script section: validate the single command-line argument, read
# the sequences it names, and set up the containers used further on.
# NOTE(review): the print statements below use Python 2 syntax.

# Characters listed here are appended to AllExpectedBases below.
GapChars = ['-','.','?']

# Check this file is called from the command line with one argument
if len(sys.argv[1:]) != 1:
  print 'Incorrect number of arguments given.'
  print 'Usage:\n', sys.argv[0], 'NameOfYourFastaFile.fasta'
  exit(1)
DataFile = sys.argv[1]

# Check that the argument exists and is a file
if not os.path.isfile(DataFile):
  print DataFile, 'does not exist or is not a file.'
  exit(1)

# Read in all the sequences as a dictionary
# (ReadSequencesFromFile is defined outside this section; it returns a pair
# that is unpacked here into the dictionary and a length.)
AllSequences, SequenceLength = ReadSequencesFromFile(DataFile)

SequenceNames = AllSequences.keys()
sequences     = AllSequences.values()
NumSequences  = len(AllSequences)

# Empty accumulators; nothing in this section fills them.
DataFromAllPositions = []
SetOfAllBasesEncountered = []

# For brevity
acgt = ['A','C','G','T']


# NOTE(review): concatenating a list with IUPACdict.keys() only works when
# keys() returns a list (Python 2 dict behaviour); IUPACdict itself is
# defined outside this section.
AllExpectedBases = acgt + IUPACdict.keys() + GapChars

        for GapChar in GapChars:
            if GapChar in PrimerList[i]:
                print('SubSeq', PrimerList[i], 'contains a gap. This is unexpected.'+\
                '\nQuitting.', file=sys.stderr)
                exit(1)
    CounterObject = collections.Counter(PrimerList)
    DuplicatedPrimers = [i for i in CounterObject if CounterObject[i] > 1]
    if len(DuplicatedPrimers) != 0:
        for DuplicatedPrimer in DuplicatedPrimers:
            print('SubSeq', DuplicatedPrimer, 'was specified twice with the same',\
            'option.', file=sys.stderr)
        print('Quitting.', file=sys.stderr)
        exit(1)

# Load the alignment file: a dictionary of its sequences plus a length value.
SeqDict, AlignmentLength = ReadSequencesFromFile(AlignmentFile)

# The chosen reference has to be one of the keys of that dictionary; if it is
# not, say so on stderr and stop.
if ChosenRef not in SeqDict:
    print('Could not find', ChosenRef, 'in', AlignmentFile + '.\nQuitting.',
          file=sys.stderr)
    exit(1)
ChosenRefSeq = SeqDict[ChosenRef]

# All distinct primers: every start primer, followed by those end primers
# that are not also start primers (so shared ones are counted once).
AllUniquePrimers = StartPrimers + [
    primer for primer in EndPrimers if primer not in StartPrimers
]
Exemple #4
0
# Top-level script section: reject duplicated contig names, check the
# consensus name, read the alignment and pick out the contig sequences.

# Every name counted more than once. CounterObject is built before this
# section (presumably a collections.Counter over the contig names - confirm).
DuplicatedContigNames = [i for i in CounterObject if CounterObject[i] > 1]
if DuplicatedContigNames:
  for ContigName in DuplicatedContigNames:
    print('Contig name', ContigName, 'was duplicated in the arguments.',
    file=sys.stderr)
  print('All contig names should be unique. Exiting.', file=sys.stderr)
  exit(1)

# Check the consensus name does not match one of the contig names.
# (Identity comparison is the reliable way to test for None.)
if ConsensusName is not None and ConsensusName in ContigNames:
  print('The consensus name should not be the same as one of the contig',
  'names. Quitting.', file=sys.stderr)
  exit(1)

# Read in the sequences from the alignment file (into a dictionary)
AllSeqsDict, AlignmentLength = ReadSequencesFromFile(AlignmentFile)

# Check the consensus is found
if ConsensusName is not None:
  if ConsensusName not in AllSeqsDict:
    print(ConsensusName, 'not found in', AlignmentFile + '. Quitting.',
    file=sys.stderr)
    exit(1)
  ConsensusSeq = AllSeqsDict[ConsensusName]

# Separate sequences into references and contigs
# NOTE(review): only ContigDict is filled by the code visible here; nothing
# in this section adds to RefDict.
RefDict = {}
ContigDict = {}
for SeqName in AllSeqsDict:
  if SeqName in ContigNames:
    ContigDict[SeqName] = AllSeqsDict[SeqName]
# Top-level script section: validate the single command-line argument and
# read the two sequences it is expected to contain.
# NOTE(review): the print statements below use Python 2 syntax.

# Check this file is called from the command line with one argument
if len(sys.argv[1:]) != 1:
  print 'Incorrect number of arguments given.'
  print 'Usage:\n', sys.argv[0], 'NameOfYourFastaFile.fasta'
  exit(1)
DataFile = sys.argv[1]

# Check that the argument exists and is a file
if not os.path.isfile(DataFile):
  print DataFile, 'does not exist or is not a file.'
  exit(1)

# Read in the sequences as a dictionary. They are (nominally) not aligned.
# The flag is passed as the second positional argument of
# ReadSequencesFromFile, which is defined outside this section.
Aligned=False
SeqDict, FirstSeqLength = ReadSequencesFromFile(DataFile, Aligned)

# We are expecting two sequences.
if len(SeqDict) != 2:
  print 'Expected 2 sequences;', DataFile, 'contains', str(len(SeqDict)) +\
  '.\nQuitting.'
  exit(1)

# NOTE(review): Seqs is indexed by position further down, which requires
# values() to return a list (Python 2 dict behaviour).
SeqNames = SeqDict.keys()
Seqs     = SeqDict.values()

# If the two sequences are the same length, there is no shuffling to do. Print
# them as they are.
if len(Seqs[0]) == len(Seqs[1]):
  for SeqName, seq in SeqDict.items():
    print '>'+SeqName
Exemple #6
0
    'specified.', file=sys.stderr)
    exit(1)

# Shorter, clearer names for the parsed command-line arguments.
MainAlnFile, PairedAlnFile = args.MainAlignmentFile, args.PairedAlignmentFile
ExciseUniqueInsertionsOfRefInMainAlignment = args.excise

# Both alignment paths must name existing files; stop at the first that
# does not.
for InputFile in (MainAlnFile, PairedAlnFile):
    if os.path.isfile(InputFile):
        continue
    print(InputFile, 'does not exist or is not a file.', file=sys.stderr)
    exit(1)

# Main alignment: dictionary of sequences, plus its names and sequences.
MainAlnSeqDict, MainAlnSeqLength = ReadSequencesFromFile(MainAlnFile)
MainAlnSeqNames = MainAlnSeqDict.keys()
MainAlnSeqs = MainAlnSeqDict.values()

# Paired alignment: must contain exactly two sequences.
PairedAlnSeqDict, PairedAlnSeqLength = ReadSequencesFromFile(PairedAlnFile)

if len(PairedAlnSeqDict) != 2:
    print('File', PairedAlnFile, 'contains', len(PairedAlnSeqDict),
          'sequences; two were expected.\nQuitting.', file=sys.stderr)
    exit(1)
Seq1name, Seq2name = PairedAlnSeqDict.keys()

# Check that one of the sequences is in the main alignment file (the 'Ref')
# and one is not (the 'SeqToAdd').
    for GapChar in GapChars:
      if GapChar in PrimerList[i]:
        print 'Primer', PrimerList[i], 'contains a gap. This is unexpected.'+\
        '\nQuitting.'
        exit(1)
  CounterObject = collections.Counter(PrimerList)
  DuplicatedPrimers = [i for i in CounterObject if CounterObject[i]>1]
  if len(DuplicatedPrimers) != 0:
    for DuplicatedPrimer in DuplicatedPrimers:
      print 'Primer', DuplicatedPrimer, 'was specified twice with the same',\
      'option.'
    print 'Quitting.'
    exit(1)

# Top-level script section: read the alignment, look up the chosen reference
# and build the list of distinct primers with their lengths.
# NOTE(review): the print statement below uses Python 2 syntax.

# Read in the sequences from the alignment file (into a dictionary)
SeqDict, AlignmentLength = ReadSequencesFromFile(AlignmentFile)

# Check the chosen reference is in the alignment
if not ChosenRef in SeqDict:
  print 'Could not find', ChosenRef, 'in', AlignmentFile+'.\nQuitting.'
  exit(1)
ChosenRefSeq = SeqDict[ChosenRef]

# Define the set of unique primers, i.e. StartPrimers+EndPrimers but not
# double counting those that appear in both. Record their lengths in a dict.
# (The result is a list: all start primers first, then the end primers that
# are not also start primers.)
AllUniquePrimers = StartPrimers + \
[primer for primer in EndPrimers if not primer in StartPrimers]
NumUniquePrimers = len(AllUniquePrimers)
PrimerLengths = {primer : len(primer) for primer in AllUniquePrimers} 

# Finds the position in the alignment, for each primer, after