def parse_alignment(filename):
    """Load an alignment and the structure string RNAalifold reports for it.

    The file is read as 'clustal' when its name ends in ``aln`` and as
    'fasta' otherwise. ``RNAalifold --noPS`` is then run on the same file,
    and the text before the first space on the second line of its stdout is
    taken as the structure.

    Args:
        filename: Path of the alignment file; passed both to ``read`` and to
            the RNAalifold command line.

    Returns:
        A ``(ali, structure)`` tuple: the object returned by ``read`` and the
        structure string extracted from the RNAalifold output.

    Raises:
        AssertionError: If the structure length differs from ``len(ali[0])``.
        Whatever ``run(..., check=True)`` raises when RNAalifold fails
        (presumably ``subprocess.CalledProcessError`` -- confirm which ``run``
        is imported).
    """
    # Only the file-name suffix decides the parser format.
    fmt = 'clustal' if filename.endswith("aln") else 'fasta'
    ali = read(filename, fmt)

    command = ["RNAalifold", "--noPS", filename]
    proc = run(command, capture_output=True, check=True, text=True)

    # The structure is the first space-delimited token of the second stdout
    # line; anything after the first space on that line is ignored.
    second_line = proc.stdout.split('\n')[1]
    structure = second_line.partition(' ')[0]

    # Sanity check: one structure character per alignment column.
    assert len(structure) == len(ali[0])
    return ali, structure
def main():
    """Trim the leading and trailing columns of a FASTA alignment.

    Reads an alignment in 'fasta' format from stdin. The individual
    characters of the first command-line argument form the set passed to
    ``contains_none`` as ``remove_chars``. Columns are scanned from the left
    and from the right until the first column for which ``contains_none``
    is truthy; the alignment is cut down to that inclusive column range and
    written to stdout in 'fasta' format.

    If no column satisfies ``contains_none`` (including the case of a
    zero-column alignment) an alignment with zero columns is written instead
    of relying on whatever value the scan loops happened to finish with.

    NOTE(review): ``contains_none`` is defined elsewhere; it is presumably
    true when a column holds none of ``remove_chars`` -- confirm.

    Raises:
        IndexError: If no command-line argument is given (``sys.argv[1]``).
    """
    alignment = read(sys.stdin, 'fasta')
    remove_chars = np.asarray(list(sys.argv[1]))
    length = alignment.get_alignment_length()

    def is_clean(col):
        # Column-wise test delegated to the file's helper.
        return contains_none(alignment[:, col], remove_chars)

    # First clean column from the left, or None when there is none at all.
    start = next((col for col in range(length) if is_clean(col)), None)

    if start is None:
        # Nothing to keep: emit the records with empty sequences. This is
        # what the slice already produced for longer alignments and now also
        # covers the zero- and one-column cases consistently.
        out = alignment[:, 0:0]
    else:
        # `start` itself is clean, so the right-hand scan always finds a
        # column and never needs to look left of `start`.
        end = next(
            col for col in range(length - 1, start - 1, -1) if is_clean(col)
        )
        out = alignment[:, start:end + 1]

    write(out, sys.stdout, 'fasta')
def main():
    """Drop alignment columns made up solely of the given characters.

    Reads an alignment in "fasta" format from stdin. The individual
    characters of the first command-line argument are passed to
    ``contains_only`` as ``remove_chars``; every column for which
    ``contains_only`` is truthy is discarded. Progress and summary counts go
    to stderr, and the filtered sequences are printed to stdout as
    ``>id`` / sequence line pairs.

    NOTE(review): ``contains_only`` is defined elsewhere; presumably it is
    true when every character of the column is in ``remove_chars`` --
    confirm.
    """
    alignment = read(sys.stdin, "fasta")
    remove_chars = np.asarray(list(sys.argv[1]))
    length = alignment.get_alignment_length()
    n_records = len(alignment)

    print("Num sequences: %d" % n_records, file=sys.stderr)
    print("Alignment length: %d" % length, file=sys.stderr)

    # Indices of the columns that survive the filter, in original order.
    keep_cols = [
        col
        for col in tqdm(range(length))
        if not contains_only(alignment[:, col], remove_chars)
    ]

    # One single-character cell per (record, kept column).
    out = np.empty((n_records, len(keep_cols)), dtype="<U1")
    for dst, src in tqdm(enumerate(keep_cols), total=len(keep_cols)):
        out[:, dst] = np.array(list(alignment[:, src]))

    print("Remaining columns: %d" % out.shape[1], file=sys.stderr)

    # Rows of `out` are in the same order as the records of the alignment.
    for record, row in zip(alignment, out):
        print(">" + record.id)
        print("".join(row))
def createFirstGuessReferenceFromReads(self):
    """Build a rough first-guess reference sequence from a few random reads.

    All reads are parsed from ``self.readInput`` (format
    ``self.readInputFormat``). When there are more reads than
    ``msaReadCount``, that many are picked at random, written to
    ``Initial_Reference/MSARaw.fasta`` under ``self.outputRootDirectory``,
    aligned with Clustal Omega, and collapsed into a consensus (threshold
    0.5). Otherwise an empty sequence is used as the reference. Either way
    the reference is written to ``Initial_Reference/FirstGuessReference.fasta``
    and that path is stored in ``self.referenceSequenceFileName``.

    Returns:
        The path stored in ``self.referenceSequenceFileName``.

    Raises:
        Exception: Any exception raised along the way is printed (type,
            value, traceback object) and then re-raised unchanged.
    """
    # TODO: make this a command-line parameter.
    # More reads = the MSA takes longer. Fewer reads = a worse reference.
    msaReadCount = 4

    print ('I choose ' + str(msaReadCount) + ' random reads.'
        + '\nThese are aligned to form a rough initial consensus sequence. Here:'
        + '\n' + join(self.outputRootDirectory,'Initial_Reference')
        + '\nPerforming ClustalO Multiple Sequence Alignment Now...')

    try:
        # Load all reads up front; they are indexed at random below.
        parsedReads = list(parse(self.readInput, self.readInputFormat))

        referenceDirectory = join(self.outputRootDirectory, 'Initial_Reference')
        if not isdir(referenceDirectory):
            makedirs(referenceDirectory)

        if len(parsedReads) > msaReadCount:
            # Pick msaReadCount distinct reads by shuffling the index list
            # and taking its head.
            randomIndexes = list(range(len(parsedReads)))
            shuffle(randomIndexes)
            rawClustalReads = [parsedReads[i] for i in randomIndexes[:msaReadCount]]

            rawClustalReadsFilename = join(referenceDirectory, 'MSARaw.fasta')
            rawClustalReadsFileWriter = createOutputFile(rawClustalReadsFilename)
            try:
                write(rawClustalReads, rawClustalReadsFileWriter, 'fasta')
            finally:
                # Close even if writing fails so the handle is not leaked.
                rawClustalReadsFileWriter.close()

            # Perform the Clustal Omega multiple sequence alignment.
            clustalOAlignmentOutputFileName = join(referenceDirectory, 'clustalOAlignment.fasta')
            clustalOCommandLine = ClustalOmegaCommandline(
                infile=rawClustalReadsFilename,
                outfile=clustalOAlignmentOutputFileName,
                verbose=True,
                auto=True,
                force=True,
                threads=int(self.numberThreads))
            clustalOCommandLine()

            # A dumb consensus has lots of ambiguous nucleotides; those are
            # expected to be polished out in later steps.
            alignmentObject = read(clustalOAlignmentOutputFileName, 'fasta')
            alignmentSummaryInfo = AlignInfo.SummaryInfo(alignmentObject)
            dumbConsensus = alignmentSummaryInfo.dumb_consensus(threshold=.5)
            referenceSequence = SeqRecord(
                Seq(str(dumbConsensus), IUPAC.IUPACUnambiguousDNA),
                id='Initial_Consensus',
                description='Initial_Consensus')
        else:
            # Not enough reads for an alignment. Using the first read or
            # raising were both considered; the chosen behaviour is to emit
            # an empty placeholder reference.
            referenceSequence = SeqRecord(
                Seq('', IUPAC.IUPACUnambiguousDNA),
                id='Initial_Consensus',
                description='Initial_Consensus')

        # Write the reference to file.
        self.referenceSequenceFileName = join(referenceDirectory, 'FirstGuessReference.fasta')
        firstGuessRefFileWriter = createOutputFile(self.referenceSequenceFileName)
        try:
            write([referenceSequence], firstGuessRefFileWriter, 'fasta')
        finally:
            firstGuessRefFileWriter.close()

        # Report completion before returning (this message used to sit
        # after the return statement and was never printed).
        print ('Done making initial consensus sequence.')
        return self.referenceSequenceFileName

    except Exception:
        print ('Exception encountered in createFirstGuessReferenceFromReads()')
        excType, excValue, excTraceback = exc_info()
        print (excType)
        print (excValue)
        print (excTraceback)
        raise