def ReadReferenceFromFile(File):
    '''Read the sequences in the reference file; check there is only one.

    Args:
        File: the reference file, passed straight to ReadSequencesFromFile.

    Returns:
        A pair (AllSequences.items(), ReferenceLength), where AllSequences and
        ReferenceLength are the two values returned by ReadSequencesFromFile.
        NOTE(review): the items are presumably (name, sequence) pairs --
        confirm against ReadSequencesFromFile.

    Exits:
        Writes a message to stderr and exits with status 1 if the file does
        not contain exactly one sequence.
    '''
    # The second argument is passed as False; presumably it is the "aligned"
    # flag of ReadSequencesFromFile -- TODO confirm.
    AllSequences, ReferenceLength = ReadSequencesFromFile(File, False)
    if len(AllSequences) != 1:
        # Report the file that was actually read (the parameter), not some
        # unrelated module-level name.
        print('Found', len(AllSequences), 'sequences in',
              str(File) + '; expected 1.\nQuitting.', file=sys.stderr)
        sys.exit(1)
    return AllSequences.items(), ReferenceLength
# Gap characters; they are appended to the list of expected bases below.
GapChars = ['-', '.', '?']

# Check this file is called from the command line with one argument.
# Messages are written with sys.stderr.write (not a print statement) so this
# section runs unchanged under Python 2 and Python 3, and so that error text
# goes to stderr rather than stdout.
if len(sys.argv[1:]) != 1:
    sys.stderr.write('Incorrect number of arguments given.\n')
    sys.stderr.write('Usage:\n' + sys.argv[0] + ' NameOfYourFastaFile.fasta\n')
    sys.exit(1)
DataFile = sys.argv[1]

# Check that the argument exists and is a file
if not os.path.isfile(DataFile):
    sys.stderr.write(DataFile + ' does not exist or is not a file.\n')
    sys.exit(1)

# Read in all the sequences as a dictionary
AllSequences, SequenceLength = ReadSequencesFromFile(DataFile)
# list(...) yields real lists on Python 3 too, where keys()/values() are views
# that can be neither indexed nor concatenated with a list.
SequenceNames = list(AllSequences.keys())
sequences = list(AllSequences.values())
NumSequences = len(AllSequences)

DataFromAllPositions = []
SetOfAllBasesEncountered = []

# For brevity
acgt = ['A', 'C', 'G', 'T']
AllExpectedBases = acgt + list(IUPACdict.keys()) + GapChars
for GapChar in GapChars: if GapChar in PrimerList[i]: print('SubSeq', PrimerList[i], 'contains a gap. This is unexpected.'+\ '\nQuitting.', file=sys.stderr) exit(1) CounterObject = collections.Counter(PrimerList) DuplicatedPrimers = [i for i in CounterObject if CounterObject[i] > 1] if len(DuplicatedPrimers) != 0: for DuplicatedPrimer in DuplicatedPrimers: print('SubSeq', DuplicatedPrimer, 'was specified twice with the same',\ 'option.', file=sys.stderr) print('Quitting.', file=sys.stderr) exit(1) # Read in the sequences from the alignment file (into a dictionary) SeqDict, AlignmentLength = ReadSequencesFromFile(AlignmentFile) # Check the chosen reference is in the alignment if not ChosenRef in SeqDict: print('Could not find', ChosenRef, 'in', AlignmentFile + '.\nQuitting.', file=sys.stderr) exit(1) ChosenRefSeq = SeqDict[ChosenRef] # Define the set of unique primers, i.e. StartPrimers+EndPrimers but not # double counting those that appear in both. Record their lengths in a dict. AllUniquePrimers = StartPrimers + \ [primer for primer in EndPrimers if not primer in StartPrimers]
# Collect every name whose count in CounterObject exceeds one. CounterObject
# is built before this point; it is indexed by name to obtain a count.
DuplicatedContigNames = [ContigName for ContigName in CounterObject
                         if CounterObject[ContigName] > 1]
if DuplicatedContigNames:
    for ContigName in DuplicatedContigNames:
        print('Contig name', ContigName, 'was duplicated in the arguments.',
              file=sys.stderr)
    print('All contig names should be unique. Exiting.', file=sys.stderr)
    sys.exit(1)

# Check the consensus name does not match one of the contig names.
if ConsensusName is not None and ConsensusName in ContigNames:
    print('The consensus name should not be the same as one of the contig',
          'names. Quitting.', file=sys.stderr)
    sys.exit(1)

# Read in the sequences from the alignment file (into a dictionary)
AllSeqsDict, AlignmentLength = ReadSequencesFromFile(AlignmentFile)

# Check the consensus is found. ConsensusSeq is only defined when a consensus
# name was supplied.
if ConsensusName is not None:
    if ConsensusName not in AllSeqsDict:
        print(ConsensusName, 'not found in', AlignmentFile + '. Quitting.',
              file=sys.stderr)
        sys.exit(1)
    ConsensusSeq = AllSeqsDict[ConsensusName]

# Separate sequences into references and contigs
RefDict = {}
ContigDict = {}
for SeqName in AllSeqsDict:
    if SeqName in ContigNames:
        ContigDict[SeqName] = AllSeqsDict[SeqName]
# Check this file is called from the command line with one argument.
# Error text is written with sys.stderr.write so that it stays out of stdout
# (which carries the printed sequences) and so that this section runs
# unchanged under Python 2 and Python 3.
if len(sys.argv[1:]) != 1:
    sys.stderr.write('Incorrect number of arguments given.\n')
    sys.stderr.write('Usage:\n' + sys.argv[0] + ' NameOfYourFastaFile.fasta\n')
    sys.exit(1)
DataFile = sys.argv[1]

# Check that the argument exists and is a file
if not os.path.isfile(DataFile):
    sys.stderr.write(DataFile + ' does not exist or is not a file.\n')
    sys.exit(1)

# Read in the sequences as a dictionary. They are (nominally) not aligned.
Aligned = False
SeqDict, FirstSeqLength = ReadSequencesFromFile(DataFile, Aligned)

# We are expecting two sequences.
if len(SeqDict) != 2:
    sys.stderr.write('Expected 2 sequences; ' + DataFile + ' contains ' +
                     str(len(SeqDict)) + '.\nQuitting.\n')
    sys.exit(1)
# list(...) so that the values can be indexed below on Python 3 as well, where
# keys()/values() return views rather than lists.
SeqNames = list(SeqDict.keys())
Seqs = list(SeqDict.values())

# If the two sequences are the same length, there is no shuffling to do. Print
# them as they are.
if len(Seqs[0]) == len(Seqs[1]):
    for SeqName, seq in SeqDict.items():
        print('>' + SeqName)
'specified.', file=sys.stderr) exit(1) # Rename arguments for brevity / clarity. MainAlnFile = args.MainAlignmentFile PairedAlnFile = args.PairedAlignmentFile ExciseUniqueInsertionsOfRefInMainAlignment = args.excise # Check that the arguments exist and are files for InputFile in [MainAlnFile, PairedAlnFile]: if not os.path.isfile(InputFile): print(InputFile, 'does not exist or is not a file.', file=sys.stderr) exit(1) # Read in the sequences from the main alignment file (into a dictionary) MainAlnSeqDict, MainAlnSeqLength = ReadSequencesFromFile(MainAlnFile) MainAlnSeqNames = MainAlnSeqDict.keys() MainAlnSeqs = MainAlnSeqDict.values() # Read in the sequences from the paired alignment file PairedAlnSeqDict, PairedAlnSeqLength = ReadSequencesFromFile(PairedAlnFile) # Check it has two sequences if len(PairedAlnSeqDict) != 2: print('File', PairedAlnFile, 'contains', len(PairedAlnSeqDict),\ 'sequences; two were expected.\nQuitting.', file=sys.stderr) exit(1) Seq1name, Seq2name = PairedAlnSeqDict.keys() # Check that one of the sequences is in the main alignment file (the 'Ref') # and one is not (the 'SeqToAdd').
for GapChar in GapChars: if GapChar in PrimerList[i]: print 'Primer', PrimerList[i], 'contains a gap. This is unexpected.'+\ '\nQuitting.' exit(1) CounterObject = collections.Counter(PrimerList) DuplicatedPrimers = [i for i in CounterObject if CounterObject[i]>1] if len(DuplicatedPrimers) != 0: for DuplicatedPrimer in DuplicatedPrimers: print 'Primer', DuplicatedPrimer, 'was specified twice with the same',\ 'option.' print 'Quitting.' exit(1) # Read in the sequences from the alignment file (into a dictionary) SeqDict, AlignmentLength = ReadSequencesFromFile(AlignmentFile) # Check the chosen reference is in the alignment if not ChosenRef in SeqDict: print 'Could not find', ChosenRef, 'in', AlignmentFile+'.\nQuitting.' exit(1) ChosenRefSeq = SeqDict[ChosenRef] # Define the set of unique primers, i.e. StartPrimers+EndPrimers but not # double counting those that appear in both. Record their lengths in a dict. AllUniquePrimers = StartPrimers + \ [primer for primer in EndPrimers if not primer in StartPrimers] NumUniquePrimers = len(AllUniquePrimers) PrimerLengths = {primer : len(primer) for primer in AllUniquePrimers} # Finds the position in the alignment, for each primer, after