コード例 #1
0
ファイル: mcc_eval.py プロジェクト: wangdi2014/lara
def parse_alignment(filename):
    """Load an alignment file and the structure string RNAalifold reports for it.

    Files whose name ends in "aln" are read as Clustal; everything else is
    read as FASTA. ``RNAalifold --noPS`` is then run on the same file, and
    the first space-delimited token on the second line of its stdout is
    taken as the structure.

    Returns:
        A tuple ``(alignment, structure)``.

    Raises:
        subprocess.CalledProcessError: if RNAalifold exits non-zero
            (``check=True``).
        AssertionError: if the structure string and the first alignment
            record differ in length.
    """
    alignment_format = 'clustal' if filename.endswith("aln") else 'fasta'
    ali = read(filename, alignment_format)

    command = ["RNAalifold", "--noPS", filename]
    proc = run(command, capture_output=True, check=True, text=True)

    # Only the second stdout line is used; anything after its first space
    # is discarded.
    output_lines = proc.stdout.split('\n')
    structure_line = output_lines[1]
    structure = structure_line.split(' ')[0]

    assert len(structure) == len(ali[0])
    return ali, structure
コード例 #2
0
def main():
    """Trim leading and trailing columns of a FASTA alignment read from stdin.

    The characters in ``sys.argv[1]`` are passed, together with each
    candidate column, to ``contains_none``. The output keeps every column
    from the first one for which ``contains_none`` is true through the last
    such column, inclusive, and is written to stdout as FASTA.

    If no column qualifies (including a zero-length alignment), an alignment
    with zero columns is written. Previously that case depended on the
    fall-through values of the loop variables: a single-column alignment was
    kept whole and a zero-length one raised ``UnboundLocalError``.

    NOTE(review): ``contains_none`` is defined elsewhere; presumably it is
    true when the column holds none of ``remove_chars`` -- confirm.
    """
    alignment = read(sys.stdin, 'fasta')
    remove_chars = np.asarray(list(sys.argv[1]))
    length = alignment.get_alignment_length()

    # First qualifying column from the left; ``length`` means "none found".
    start = next(
        (col for col in range(length)
         if contains_none(alignment[:, col], remove_chars)),
        length,
    )

    # Last qualifying column from the right. The scan stops at ``start``
    # because that column is already known to qualify; the default
    # ``start - 1`` makes the slice below empty when nothing was found.
    end = next(
        (col for col in range(length - 1, start - 1, -1)
         if contains_none(alignment[:, col], remove_chars)),
        start - 1,
    )

    out = alignment[:, start:end + 1]
    write(out, sys.stdout, 'fasta')
コード例 #3
0
def main():
    """Drop alignment columns flagged by ``contains_only`` and print FASTA.

    Reads a FASTA alignment from stdin and takes a set of characters from
    ``sys.argv[1]``. Every column for which
    ``contains_only(column, remove_chars)`` is false is kept. Sequence and
    column counts go to stderr; the filtered records go to stdout as
    ``>id`` followed by the remaining characters on one line.

    NOTE(review): ``contains_only`` is defined elsewhere; presumably it is
    true when the column consists solely of ``remove_chars`` -- confirm.
    """
    alignment = read(sys.stdin, "fasta")
    remove_chars = np.asarray(list(sys.argv[1]))
    num_cols = alignment.get_alignment_length()
    num_seqs = len(alignment)
    print("Num sequences: %d" % num_seqs, file=sys.stderr)
    print("Alignment length: %d" % num_cols, file=sys.stderr)

    # Indices of the columns that survive the filter, in original order.
    kept = [
        col
        for col in tqdm(range(num_cols))
        if not contains_only(alignment[:, col], remove_chars)
    ]

    # One row per sequence, one single-character cell per surviving column.
    filtered = np.empty((num_seqs, len(kept)), dtype="<U1")
    for dst, src in tqdm(enumerate(kept), total=len(kept)):
        filtered[:, dst] = np.array(list(alignment[:, src]))

    print("Remaining columns: %d" % filtered.shape[1], file=sys.stderr)
    for record, row in zip(alignment, filtered):
        print(">" + record.id)
        print("".join(row))
コード例 #4
0
    def createFirstGuessReferenceFromReads(self):
        """Build a rough first-guess reference sequence and write it as FASTA.

        All reads are parsed from ``self.readInput`` (format
        ``self.readInputFormat``). When there are more than ``msaReadCount``
        reads, a random subset of that size is written to 'MSARaw.fasta',
        aligned with Clustal Omega, and the dumb consensus (threshold 0.5) of
        that alignment becomes the reference. Otherwise an empty sequence is
        used. Either way the record has id and description
        'Initial_Consensus'.

        All files are written under ``<outputRootDirectory>/Initial_Reference``,
        which is created if it does not exist.

        Returns:
            The path of the written reference file, which is also stored in
            ``self.referenceSequenceFileName``.

        Raises:
            Exception: any error raised along the way is printed via
                ``exc_info()`` and then re-raised unchanged.
        """
        #TODO: I should make this a commandline parameter. More = MSA takes longer. Less = worse reference
        msaReadCount = 4

        print ('I choose ' + str(msaReadCount) + ' random reads.'
            + '\nThese are aligned to form a rough initial consensus sequence. Here:'
            + '\n' + join(self.outputRootDirectory,'Initial_Reference')
            + '\nPerforming ClustalO Multiple Sequence Alignment Now...')
        try:
            # Load reads from file
            parsedReads = list(parse(self.readInput, self.readInputFormat))

            # Reference directory
            referenceDirectory = join(self.outputRootDirectory,'Initial_Reference')
            if not isdir(referenceDirectory):
                makedirs(referenceDirectory)

            if (len(parsedReads) > msaReadCount):
                # Select a random subset of reads for the multiple sequence
                # alignment: shuffle the indexes, take the first msaReadCount.
                randomIndexes = list(range(0, len(parsedReads)))
                shuffle(randomIndexes)
                rawClustalReads = [parsedReads[i] for i in randomIndexes[:msaReadCount]]

                rawClustalReadsFilename = join(referenceDirectory, 'MSARaw.fasta')
                rawClustalReadsFileWriter = createOutputFile(rawClustalReadsFilename)
                try:
                    write(rawClustalReads, rawClustalReadsFileWriter, 'fasta')
                finally:
                    # Close even if writing fails so the handle is not leaked.
                    rawClustalReadsFileWriter.close()

                # Perform Clustal MSA
                clustalOAlignmentOutputFileName = join(referenceDirectory, 'clustalOAlignment.fasta')
                clustalOCommandLine = ClustalOmegaCommandline(infile=rawClustalReadsFilename, outfile=clustalOAlignmentOutputFileName, verbose=True, auto=True, force=True, threads=int(self.numberThreads))
                clustalOCommandLine()

                # Calculate consensus
                # A dumb consensus has lots of ambiguous nucleotides.  We'll polish those out later.
                alignmentType = 'fasta'
                alignmentObject = read(clustalOAlignmentOutputFileName, alignmentType)
                alignmentSummaryInfo = AlignInfo.SummaryInfo(alignmentObject)
                dumbConsensus = alignmentSummaryInfo.dumb_consensus(threshold=.5)

                referenceSequence = SeqRecord(Seq(str(dumbConsensus) , IUPAC.IUPACUnambiguousDNA),
                    id='Initial_Consensus',
                    description='Initial_Consensus')

            else:
                # Not enough reads to align: fall back to an empty reference
                # sequence rather than raising.
                referenceSequence = SeqRecord(Seq('' , IUPAC.IUPACUnambiguousDNA),
                    id='Initial_Consensus',
                    description='Initial_Consensus')

            # Write reference to file
            self.referenceSequenceFileName = join(referenceDirectory, 'FirstGuessReference.fasta')
            firstGuessRefFileWriter = createOutputFile(self.referenceSequenceFileName)
            try:
                write([referenceSequence], firstGuessRefFileWriter, 'fasta')
            finally:
                # Close even if writing fails so the handle is not leaked.
                firstGuessRefFileWriter.close()

            # This message used to sit after the return and never printed.
            print ('Done making initial consensus sequence.')

            return self.referenceSequenceFileName

        except Exception:
            print ('Exception encountered in createFirstGuessReferenceFromReads()')
            print (exc_info()[0])
            print (exc_info()[1])
            print (exc_info()[2])
            raise