def ReadReferenceFromFile(File):
    '''Read in all sequences in the reference file; check there is only one.

    File is passed straight to ReadSequencesFromFile (with False as the second
    argument); it is also concatenated into the error message, so it is
    expected to be a string (presumably the path of the reference file).

    Returns a pair: the .items() of the sequence collection returned by
    ReadSequencesFromFile (presumably a dict of name -> sequence; confirm
    against that helper), and the length value it returned alongside.

    If the file does not contain exactly one sequence, a message is printed to
    stderr and the program exits with status 1.
    '''
    AllSequences, ReferenceLength = ReadSequencesFromFile(File, False)
    if len(AllSequences) != 1:
        # Name the file we were actually asked to read: use the File argument,
        # not an unrelated (and possibly undefined) outer variable.
        print('Found', len(AllSequences), 'sequences in', File +
              '; expected 1.\nQuitting.', file=sys.stderr)
        exit(1)
    return AllSequences.items(), ReferenceLength
# Imported here so this section stands on its own; a repeated import is
# harmless if the file already imports sys at the top.
import sys

# Read the input sequences, passing Aligned=False to ReadSequencesFromFile.
Aligned = False
SeqDict, FirstSeqLength = ReadSequencesFromFile(DataFile, Aligned)

# We are expecting two sequences.
if len(SeqDict) != 2:
    # Report on stderr so the message cannot be mistaken for (or mixed into)
    # the sequence output that this script prints to stdout.
    sys.stderr.write('Expected 2 sequences; ' + str(DataFile) + ' contains ' +
                     str(len(SeqDict)) + '.\nQuitting.\n')
    sys.exit(1)

# Materialise the names and sequences as lists so that they can be indexed
# (dict views are not indexable in Python 3). The two lists are in matching
# order because the dict is not modified between the two calls.
SeqNames = list(SeqDict.keys())
Seqs = list(SeqDict.values())

# If the two sequences are the same length, there is no shuffling to do. Print
# them as they are.
if len(Seqs[0]) == len(Seqs[1]):
    for SeqName, seq in SeqDict.items():
        print('>' + SeqName)
        print(seq)
    sys.exit(0)

# Find which of the two sequences is the shorter one. The lengths are known to
# differ at this point, so exactly one of them is strictly shorter.
if len(Seqs[0]) < len(Seqs[1]):
    ShorterSeqName, ShorterSeq = SeqNames[0], Seqs[0]
    LongerSeqName, LongerSeq = SeqNames[1], Seqs[1]
else:
    ShorterSeqName, ShorterSeq = SeqNames[1], Seqs[1]
    LongerSeqName, LongerSeq = SeqNames[0], Seqs[0]

# How many characters the shorter sequence falls short of the longer one.
LenLongerSeq = len(LongerSeq)
deficit = LenLongerSeq - len(ShorterSeq)
# Collect every primer boundary as a [label, alignment position] pair (starts
# first, then ends) and order the pairs by position. The sort is stable, so
# pairs with equal positions keep that starts-before-ends order.
SortedList = [['start_of_' + primer, value]
              for primer, value in StartPrimerPositions.items()]
SortedList += [['end_of_' + primer, value]
               for primer, value in EndPrimerPositions.items()]
SortedList.sort(key=lambda item: item[1])

# If only alignment coordinates were requested, print them on one line and
# stop here.
if args.AlignmentCoords:
    print(' '.join(str(position) for _, position in SortedList))
    exit(0)

# Now convert those primer positions, which were with respect to the alignment,
# into positions with respect to each reference.
# Walk along each sequence once, left to right: between one primer position
# and the next, add the number of non-gap characters to a running total, and
# record that total at each primer position.
PositionsDict = {}
for SeqName, seq in SeqDict.items():
    PositionsWRTseq = []
    LastPositionWRTalignment = 0
    PositionWRTseq = 0
    for name, PrimerPosition in SortedList:
        segment = seq[LastPositionWRTalignment:PrimerPosition]
        PositionWRTseq += sum(1 for base in segment if base not in GapChars)
        PositionsWRTseq.append(PositionWRTseq)
        LastPositionWRTalignment = PrimerPosition
    # Replace any zeroes by ones (i.e. map positions off to the left onto the
    # first position for this sequence). Done in place via slice assignment.
    PositionsWRTseq[:] = [1 if position == 0 else position
                          for position in PositionsWRTseq]
# alignment has a gap there in which case we just skip. b) If it's not a # GapChar, and is a unique insertion (i.e. at that position in the main # alignment only that sequence has a base), we want to excise that base from # SeqToAdd. c) If it's neither a GapChar nor a unique insertion, we use that # base from SeqToAdd. SeqToAdd_WithGaps = '' ReferenceWithoutInsertions = '' PositionInSeqToAdd = 0 PositionInFinalAln = 0 TranslationRecord = '"position w.r.t. final alignment","position w.r.t. ref","base"' for MainPosition, BaseInMainRef in enumerate(RefSeqFromMain): if BaseInMainRef == GapChar: EveryBaseHereIsAGap = False if ExciseUniqueInsertionsOfRefInMainAlignment: EveryBaseHereIsAGap = True for SeqName, seq in MainAlnSeqDict.items(): if SeqName == RefSeqName: continue elif seq[MainPosition] != GapChar: EveryBaseHereIsAGap = False break if EveryBaseHereIsAGap: continue SeqToAdd_WithGaps += GapChar ReferenceWithoutInsertions += GapChar PositionInFinalAln += 1 TranslationRecord += '\n' + str(PositionInFinalAln) + ',-,-' else: SeqBase = SeqToAdd[PositionInSeqToAdd] RefInMainHasUniqueInsertion = False if ExciseUniqueInsertionsOfRefInMainAlignment: