Exemple #1
0
 def test_stockholm_from_alignment(self):
     """stockholm_from_alignment: empty, default and interleave_len=2 cases."""
     # An empty alignment must give an empty string.
     empty_output = stockholm_from_alignment({})
     self.assertEqual(empty_output, '')

     # Default call on the dict fixture must match the labelled fixture.
     default_output = stockholm_from_alignment(self.alignment_dict)
     self.assertEqual(default_output, self.stockholm_with_label)

     # Same input with interleave_len=2 must match the "_lw2" fixture.
     interleaved_output = stockholm_from_alignment(
         self.alignment_dict, interleave_len=2)
     self.assertEqual(interleaved_output, self.stockholm_with_label_lw2)
Exemple #2
0
 def test_stockholm_from_alignment_reordered(self):
     """stockholm_from_alignment on self.alignment_object, both layouts."""
     # Default call must match the reordered labelled fixture.
     default_output = stockholm_from_alignment(self.alignment_object)
     self.assertEqual(default_output, self.stockholm_with_label_reordered)

     # Same input with interleave_len=2 must match the reordered lw2 fixture.
     interleaved_output = stockholm_from_alignment(
         self.alignment_object, interleave_len=2)
     self.assertEqual(interleaved_output,
                      self.stockholm_with_label_lw2_reordered)
Exemple #3
0
 def test_stockholm_from_alignment(self):
     """Check stockholm_from_alignment against the dict-based fixtures."""
     # (call arguments, call keyword arguments) are spelled out per case so
     # each expectation sits right next to the call that produces it.
     from_empty = stockholm_from_alignment({})
     self.assertEqual(from_empty, '')

     from_dict = stockholm_from_alignment(self.alignment_dict)
     self.assertEqual(from_dict, self.stockholm_with_label)

     from_dict_lw2 = stockholm_from_alignment(self.alignment_dict,
                                              interleave_len=2)
     self.assertEqual(from_dict_lw2, self.stockholm_with_label_lw2)
Exemple #4
0
 def test_stockholm_from_alignment_struct(self):
     """stockholm_from_alignment with a GC_annotation supplied."""
     annotation = self.gc_annotation

     # An empty alignment gives an empty string even with an annotation.
     empty_output = stockholm_from_alignment({}, GC_annotation=annotation)
     self.assertEqual(empty_output, '')

     # Default call with annotation must match the "_struct" fixture.
     default_output = stockholm_from_alignment(
         self.alignment_dict, GC_annotation=annotation)
     self.assertEqual(default_output, self.stockholm_with_label_struct)

     # interleave_len=2 with annotation must match the "_struct_lw2" fixture.
     interleaved_output = stockholm_from_alignment(
         self.alignment_dict, GC_annotation=annotation, interleave_len=2)
     self.assertEqual(interleaved_output,
                      self.stockholm_with_label_struct_lw2)
Exemple #5
0
 def test_stockholm_from_alignment_struct(self):
     """Check stockholm_from_alignment output when GC_annotation is given."""
     gc_lines = self.gc_annotation

     # Empty alignment: result is '' regardless of the annotation.
     from_empty = stockholm_from_alignment({}, GC_annotation=gc_lines)
     self.assertEqual(from_empty, '')

     # Dict fixture, default layout.
     from_dict = stockholm_from_alignment(self.alignment_dict,
                                          GC_annotation=gc_lines)
     self.assertEqual(from_dict, self.stockholm_with_label_struct)

     # Dict fixture, interleave_len=2.
     from_dict_lw2 = stockholm_from_alignment(self.alignment_dict,
                                              GC_annotation=gc_lines,
                                              interleave_len=2)
     self.assertEqual(from_dict_lw2, self.stockholm_with_label_struct_lw2)
def run_fold_for_infernal(currgroup, groupfasta, basefolder, minseqs=1):
    '''Fold one group with BayesFold and write the results to disk.

    Worker function (one call per group).  Reads the sequences in groupfasta,
    passes them to bayesfold and writes, into the folder
    basefolder + "group_<currgroup>":
        unique.fasta, log.txt, bayesfold-aln.fasta, bayesfold-aln.sto
    and finally calls make_r2r on the stockholm file.

    currgroup:  group identifier; always converted with str() before use.
    groupfasta: path of the group's fasta file.  The second "_"-separated
                field of every header must be an integer count.
    basefolder: prefix of the output folder.  It is concatenated as-is, so
                it must already end with a path separator.
    minseqs:    minimum summed count required to process the group.

    Returns "" when the group is skipped (output folder already exists, or
    the summed count is below minseqs) and None otherwise.  Any exception is
    printed instead of raised, so a failing group does not stop the caller.
    '''
    try:
        # skip groups already written by an earlier (possibly crashed) run
        currotufolder = basefolder + "group_" + str(currgroup)
        if exists(currotufolder):
            return ""
        seqs = []
        count = 0
        out = "group " + str(currgroup) + ": "
        # the with-block closes the input handle as soon as parsing is done
        with open(groupfasta, 'rU') as fastain:
            for header, seq in MinimalFastaParser(fastain):
                abundance = header.split("_")[1]
                seqs.append((header.split()[0] + "_" + abundance, seq))
                count += int(abundance)
        out += "\n" + str(count) + " sequences\n"
        # make sure group has enough sequences before continuing
        if count < minseqs:
            # str() is required: currgroup is not guaranteed to be a string
            print(str(currgroup) + " has less than " + str(minseqs) +
                  " sequences, skipping")
            return ""
        aln, struct = bayesfold(seqs)
        # create output folder for group
        mkdir(currotufolder)
        # NOTE(review): the 50 threshold presumably mirrors a cap on how many
        # sequences bayesfold aligns -- confirm against bayesfold.
        if aln.getNumSeqs() < 50:
            out += str(aln.getNumSeqs()) + " unique sequences\n"
            with open(currotufolder + "/unique.fasta", 'w') as fout:
                fout.write(aln.toFasta())
        else:
            # second return value of remove_duplicates is not needed here
            s, _ = remove_duplicates(seqs)
            out += str(len(s)) + " unique sequences\n"
            write_fasta_list(s, currotufolder + "/unique.fasta")
        out += "Structure: " + struct + "\n"
        # write the log, then the alignment + structure in fasta and
        # stockholm formats
        with open(currotufolder + "/log.txt", 'w') as logout:
            logout.write(out)
        with open(currotufolder + "/bayesfold-aln.fasta", 'w') as alnout:
            alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
        struct_dict = {'SS_cons': struct}
        with open(currotufolder + "/bayesfold-aln.sto", 'w') as alnout:
            alnout.write(
                stockholm_from_alignment(aln, GC_annotation=struct_dict))
        # make R2R secondary structure for alignment
        make_r2r(currotufolder + "/bayesfold-aln.sto", currotufolder,
                 "group_" + str(currgroup))
    except Exception as e:
        # best effort: report the failure and let the caller carry on
        print(str(e))
        stdout.flush()
def run_locarnap_for_infernal(currgroup, clusters, otus, basefolder):
    '''Align one group with locarna-p and write the results to disk.

    Worker function (one call per group).  Collects the sequences of every
    cluster in the group, passes them to run_locarnap and writes, into the
    folder basefolder + "group_<currgroup>":
        unique.fasta, log.txt, locarnap-aln.fasta, locarnap-aln.sto
    and finally calls make_r2r on the stockholm file.

    currgroup:  group identifier; always converted with str() before use.
    clusters:   iterable of cluster names (strings) belonging to the group.
    otus:       mapping from cluster name to the path of its fasta file.
                The second "_"-separated field of every header must be an
                integer count.
    basefolder: prefix of the output folder.  It is concatenated as-is, so
                it must already end with a path separator.

    Returns "" when the output folder already exists and None otherwise.
    Unlike run_fold_for_infernal, exceptions are not caught here and no
    minimum group size is enforced.
    '''
    # skip groups already written by an earlier (possibly crashed) run
    currotufolder = basefolder + "group_" + str(currgroup)
    if exists(currotufolder):
        return ""
    seqs = []
    # running total over ALL clusters; initialised once so it is neither
    # reset per cluster nor left undefined when clusters is empty
    count = 0
    out = "group " + str(currgroup) + ": "
    for cluster in clusters:
        out += cluster + " "
        # the with-block closes each input handle once it has been parsed
        with open(otus[cluster], 'rU') as fastain:
            for header, seq in MinimalFastaParser(fastain):
                seqs.append((header.split()[0], seq))
                count += int(header.split("_")[1])
    out += "\n" + str(count) + " sequences\n"
    # run locarna-p on the collected sequences
    # NOTE(review): the positional 50 presumably caps the number of
    # sequences aligned -- confirm against run_locarnap.
    aln, struct = run_locarnap(seqs, 50, cpus=2, foldless=True)

    # create output folder for group
    mkdir(currotufolder)
    if aln.getNumSeqs() < 50:
        out += str(aln.getNumSeqs()) + " unique sequences\n"
        with open(currotufolder + "/unique.fasta", 'w') as fout:
            fout.write(aln.toFasta())
    else:
        # second return value of remove_duplicates is not needed here
        s, _ = remove_duplicates(seqs)
        out += str(len(s)) + " unique sequences\n"
        write_fasta_list(s, currotufolder + "/unique.fasta")
    out += "Structure: " + struct + "\n"

    # write the log, then the alignment + structure in fasta and stockholm
    # formats
    with open(currotufolder + "/log.txt", 'w') as logout:
        logout.write(out)
    with open(currotufolder + "/locarnap-aln.fasta", 'w') as alnout:
        alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
    struct_dict = {'SS_cons': struct}
    with open(currotufolder + "/locarnap-aln.sto", 'w') as alnout:
        alnout.write(stockholm_from_alignment(aln, GC_annotation=struct_dict))
    # make R2R secondary structure for alignment
    make_r2r(currotufolder + "/locarnap-aln.sto", currotufolder,
             "group_" + str(currgroup))
Exemple #8
0
def run_fold_for_infernal(currgroup, groupfasta, basefolder, minseqs=1):
    '''Fold one group with BayesFold and write the results to disk.

    Worker function (one call per group).  Reads the sequences in groupfasta,
    passes at most the first 500 of them to bayesfold and writes, into the
    folder basefolder + "group_<currgroup>":
        log.txt, bayesfold-aln.fasta, bayesfold-aln.sto
    and finally calls make_r2r on the stockholm file.

    currgroup:  group identifier; always converted with str() before use.
    groupfasta: path of the group's fasta file.  The second "_"-separated
                field of every header must be an integer count.
    basefolder: prefix of the output folder.  It is concatenated as-is, so
                it must already end with a path separator.
    minseqs:    minimum summed count required to process the group.

    Returns "" when the group is skipped (output folder already exists, or
    the summed count is below minseqs) and None otherwise.  Any exception is
    printed instead of raised, so a failing group does not stop the caller.
    '''
    try:
        # skip groups already written by an earlier (possibly crashed) run
        currotufolder = basefolder + "group_" + str(currgroup)
        if exists(currotufolder):
            return ""
        seqs = []
        count = 0
        out = "group " + str(currgroup) + ": "
        # the with-block closes the input handle as soon as parsing is done
        with open(groupfasta, 'rU') as fastain:
            for header, seq in MinimalFastaParser(fastain):
                abundance = header.split("_")[1]
                seqs.append((header.split()[0] + "_" + abundance, seq))
                count += int(abundance)
        out += "\n" + str(count) + " sequences\n"
        if count < minseqs:
            return ""
        stdout.flush()
        # hard limit of 500 sequences to align and fold for memory reasons
        if len(seqs) > 500:
            seqs = seqs[:500]
        # run BayesFold on sequences in the group
        # NOTE(review): an earlier comment said "maxiters set to 5", but the
        # only parameter passed is "-diags" -- confirm which is intended.
        aln, struct = bayesfold(seqs, params={"-diags": True})
        # create output folder for group
        mkdir(currotufolder)
        out += str(aln.getNumSeqs()) + " unique sequences\n"
        out += "Structure: " + struct + "\n"
        # write the log, then the alignment + structure in fasta and
        # stockholm formats
        with open(currotufolder + "/log.txt", 'w') as logout:
            logout.write(out)
        with open(currotufolder + "/bayesfold-aln.fasta", 'w') as alnout:
            alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
        struct_dict = {'SS_cons': struct}
        with open(currotufolder + "/bayesfold-aln.sto", 'w') as alnout:
            alnout.write(
                stockholm_from_alignment(aln, GC_annotation=struct_dict))
        # make R2R secondary structure for alignment
        make_r2r(currotufolder + "/bayesfold-aln.sto", currotufolder,
                 "group_" + str(currgroup))
    except Exception as e:
        # best effort: report the failure and let the caller carry on
        print(str(e))
        stdout.flush()
Exemple #9
0
 def test_stockholm_from_alignment_reordered(self):
     """Check stockholm_from_alignment against the reordered fixtures."""
     # Alignment-object fixture, default layout.
     from_object = stockholm_from_alignment(self.alignment_object)
     self.assertEqual(from_object, self.stockholm_with_label_reordered)

     # Alignment-object fixture, interleave_len=2.
     from_object_lw2 = stockholm_from_alignment(self.alignment_object,
                                                interleave_len=2)
     self.assertEqual(from_object_lw2,
                      self.stockholm_with_label_lw2_reordered)
Exemple #10
0
def create_final_output(groupfasta, basefolder, minseqs=1, cpus=1):
    '''Write the BayesFold, R2R and Infernal outputs for one group.

    Worker function (one call per group fasta file).  The group name is the
    file name of groupfasta up to its first "."; all output goes into the
    folder basefolder + <group name> (basefolder is concatenated as-is).
    Always returns None.  Nothing is written when that folder already exists
    or when count_seqs reports fewer than minseqs for the alignment names.
    cpus is forwarded to calibrate_cmfile.
    '''
    # an existing folder means the group was handled by an earlier run
    group_name = groupfasta.split("/")[-1].split(".")[0]
    outdir = basefolder + group_name
    if exists(outdir):
        return

    # read the pre-aligned sequences and check the group is big enough
    aln = LoadSeqs(groupfasta, moltype=RNA, aligned=True)
    total = count_seqs(aln.Names)
    if total < minseqs:
        return

    # one (name, weight-as-string) pair per sequence; weight == its count
    weighted_names = []
    heaviest = 0
    for label in aln.Names:
        seq_weight = count_seqs(label)
        if seq_weight > heaviest:
            heaviest = seq_weight
        weighted_names.append((label.split()[0], str(seq_weight)))

    # fold the alignment with bayesfold (called with align=False)
    aln, struct = bayesfold(aln, align=False)

    # log file: group name, total count, unique count and structure
    mkdir(outdir)
    with open(outdir + "/log.txt", 'w') as logout:
        log_fields = [
            group_name, ":\n",
            str(total), "sequences\n",
            str(aln.getNumSeqs()), "unique sequences\nStructure: ", struct,
            "\n",
        ]
        logout.write(' '.join(log_fields))
    # structure followed by the alignment, in fasta format
    with open(outdir + "/bayesfold-aln.fasta", 'w') as alnout:
        fasta_text = ">SS_cons\n%s\n%s" % (struct, aln.toFasta())
        alnout.write(fasta_text)

    # keep only the first whitespace-separated token of each label, then
    # render the alignment plus structure as stockholm text
    aln = LoadSeqs(data=aln, moltype=RNA,
                   label_to_name=lambda label: label.split()[0])
    sto = stockholm_from_alignment(aln, GC_annotation={'SS_cons': struct})
    del aln

    # per-sequence "#=GS ... WT" lines for infernal, scaled by the largest
    # weight seen
    gs_lines = ''.join(
        '# =GS %s WT %s\n' % (name, str(float(wt) / heaviest))
        for name, wt in weighted_names)
    # single weight-map line for r2r: "name weight name weight ..."
    flat_weights = [field for pair in weighted_names for field in pair]
    r2r_line = "# =GF USE_THIS_WEIGHT_MAP " + ' '.join(flat_weights)

    # The last line of the stockholm text is overwritten with the weight
    # lines and a fresh "//" terminator is appended afterwards.
    # NOTE(review): presumably the overwritten line is the original "//"
    # terminator -- confirm against stockholm_from_alignment.
    sto_lines = sto.split("\n")
    sto_lines[-1] = gs_lines.strip()
    sto_lines.append(r2r_line)
    sto_lines.append("//\n")
    stofile = outdir + "/bayesfold-aln.sto"
    with open(stofile, 'w') as alnout:
        alnout.write('\n'.join(sto_lines))

    # R2R drawing, then build and calibrate the infernal CM from the
    # weighted stockholm file
    make_r2r(stofile, outdir, group_name)
    cm_path = outdir + "/cmfile.cm"
    cmbuild_from_file(stofile, cm_path, params={'--wgiven': True})
    calibrate_cmfile(cm_path, cpus=cpus)
    def setUp(self):
        """Create the sequence/structure fixtures and temp files for the tests."""
        # seqs1: unaligned sequences, plain and with gap characters
        self.seqs1_unaligned = {'1':'ACUGCUAGCUAGUAGCGUACGUA',
                                '2':'GCUACGUAGCUAC',
                                '3':'GCGGCUAUUAGAUCGUA'}
        self.struct1_unaligned_string = '....(((...)))....'
        self.seqs1_unaligned_gaps = {'1':'ACUGCUAGCUAGU-AGCGUAC--GUA',
                                     '2':'--GCUACGUAGCUAC',
                                     '3':'GCGGCUAUUAGAUCGUA--'}

        # seqs2: the same three sequences in aligned and unaligned form
        self.seqs2_aligned = {'a': 'UAGGCUCUGAUAUAAUAGCUCUC---------',
                              'c': '------------UGACUACGCAU---------',
                              'b': '----UAUCGCUUCGACGAUUCUCUGAUAGAGA'}

        self.seqs2_unaligned = {'a': 'UAGGCUCUGAUAUAAUAGCUCUC',
                                'c': 'UGACUACGCAU',
                                'b': 'UAUCGCUUCGACGAUUCUCUGAUAGAGA'}

        # structure for seqs2_aligned, as a string and as a GC annotation
        self.struct2_aligned_string = '............((.(...)))..........'
        self.struct2_aligned_dict = {'SS_cons':self.struct2_aligned_string}

        # stockholm text of seqs2_aligned plus its structure annotation
        self.lines2 = stockholm_from_alignment(
            aln=self.seqs2_aligned,
            GC_annotation=self.struct2_aligned_dict)

        # self.seqs1 aligned to self.seqs2 with self.seqs2 included.
        self.seqs1_and_seqs2_aligned = {
            'a': 'UAGGCUCUGAUAUAAUAGC-UCUC---------',
            'b': '----UAUCGCUUCGACGAU-UCUCUGAUAGAGA',
            'c': '------------UGACUAC-GCAU---------',
            '1': '-ACUGCUAGCUAGUAGCGUACGUA---------',
            '2': '----------GCUACGUAG-CUAC---------',
            '3': '-----GCGGCUAUUAG-AU-CGUA---------',
        }

        self.seqs1_and_seqs2_aligned_struct_string = (
            '............((.(....)))..........')

        # self.seqs1 aligned to self.seqs2 without self.seqs2 included.
        self.seqs1_aligned = {
            '1': 'ACUGCUAGCUAGUAGCGUACGUA',
            '2': '---------GCUACGUAG-CUAC',
            '3': '----GCGGCUAUUAG-AU-CGUA',
        }

        self.seqs1_aligned_struct_string = (
            '...........((.(....))).')

        # scratch directories: a fresh one, and a fixed one with spaces in
        # its path
        self.temp_dir = tempfile.mkdtemp()
        self.temp_dir_spaces = '/tmp/test for infernal/'
        try:
            mkdir(self.temp_dir_spaces)
        except OSError:
            # ignored on purpose (e.g. the directory already exists)
            pass
        try:
            # stockholm file with the seqs2 alignment
            seqs_handle = open(path.join(self.temp_dir, 'seqs1.sto'),'w')
            seqs_handle.write(self.lines2)
            seqs_handle.close()
            # covariance-model file, contents taken from ALN1_CM
            self.cmfile = path.join(self.temp_dir, 'aln2.cm')
            cm_handle = open(self.cmfile,'w')
            cm_handle.write(ALN1_CM)
            cm_handle.close()
            # alignment file used to create the cm file
            self.aln2_file = path.join(self.temp_dir, 'aln2.sto')
            aln_handle = open(self.aln2_file,'w')
            aln_handle.write(self.lines2)
            aln_handle.close()
        except OSError:
            # NOTE(review): failures here are silently ignored, so later
            # tests may find self.cmfile / self.aln2_file unset.
            pass
Exemple #12
0
 def setUp(self):
     """Create the sequence/structure fixtures and temp files for the tests."""
     # seqs1: unaligned sequences, plain and with gap characters
     self.seqs1_unaligned = {'1':'ACUGCUAGCUAGUAGCGUACGUA',
                             '2':'GCUACGUAGCUAC',
                             '3':'GCGGCUAUUAGAUCGUA'}
     self.struct1_unaligned_string = '....(((...)))....'
     self.seqs1_unaligned_gaps = {'1':'ACUGCUAGCUAGU-AGCGUAC--GUA',
                                  '2':'--GCUACGUAGCUAC',
                                  '3':'GCGGCUAUUAGAUCGUA--'}

     # seqs2: the same three sequences in aligned and unaligned form
     self.seqs2_aligned = {'a': 'UAGGCUCUGAUAUAAUAGCUCUC---------',
                           'c': '------------UGACUACGCAU---------',
                           'b': '----UAUCGCUUCGACGAUUCUCUGAUAGAGA'}

     self.seqs2_unaligned = {'a': 'UAGGCUCUGAUAUAAUAGCUCUC',
                             'c': 'UGACUACGCAU',
                             'b': 'UAUCGCUUCGACGAUUCUCUGAUAGAGA'}

     # structure for seqs2_aligned, as a string and as a GC annotation
     self.struct2_aligned_string = '............((.(...)))..........'
     self.struct2_aligned_dict = {'SS_cons':self.struct2_aligned_string}

     # stockholm text of seqs2_aligned plus its structure annotation
     self.lines2 = stockholm_from_alignment(
         aln=self.seqs2_aligned,
         GC_annotation=self.struct2_aligned_dict)

     # self.seqs1 aligned to self.seqs2 with self.seqs2 included.
     self.seqs1_and_seqs2_aligned = {
         'a': 'UAGGCUCUGAUAUAAUAGC-UCUC---------',
         'b': '----UAUCGCUUCGACGAU-UCUCUGAUAGAGA',
         'c': '------------UGACUAC-GCAU---------',
         '1': '-ACUGCUAGCUAGUAGCGUACGUA---------',
         '2': '----------GCUACGUAG-CUAC---------',
         '3': '-----GCGGCUAUUAG-AU-CGUA---------',
     }

     self.seqs1_and_seqs2_aligned_struct_string = (
         '............((.(....)))..........')

     # self.seqs1 aligned to self.seqs2 without self.seqs2 included.
     self.seqs1_aligned = {
         '1': 'ACUGCUAGCUAGUAGCGUACGUA',
         '2': '---------GCUACGUAG-CUAC',
         '3': '----GCGGCUAUUAG-AU-CGUA',
     }

     self.seqs1_aligned_struct_string = (
         '...........((.(....))).')

     # scratch directories: a fresh one, and a fixed one with spaces in
     # its path
     self.temp_dir = tempfile.mkdtemp()
     self.temp_dir_spaces = '/tmp/test for infernal/'
     try:
         mkdir(self.temp_dir_spaces)
     except OSError:
         # ignored on purpose (e.g. the directory already exists)
         pass
     try:
         # stockholm file with the seqs2 alignment
         seqs_handle = open(path.join(self.temp_dir, 'seqs1.sto'),'w')
         seqs_handle.write(self.lines2)
         seqs_handle.close()
         # covariance-model file, contents taken from ALN1_CM
         self.cmfile = path.join(self.temp_dir, 'aln2.cm')
         cm_handle = open(self.cmfile,'w')
         cm_handle.write(ALN1_CM)
         cm_handle.close()
         # alignment file used to create the cm file
         self.aln2_file = path.join(self.temp_dir, 'aln2.sto')
         aln_handle = open(self.aln2_file,'w')
         aln_handle.write(self.lines2)
         aln_handle.close()
     except OSError:
         # NOTE(review): failures here are silently ignored, so later
         # tests may find self.cmfile / self.aln2_file unset.
         pass
        args = {"--cpus": "24"}
        aln, struct = create_locarnap_alignment(seqs, RNA, struct=True, params=args)
        # create output folder for OTU
        otufolder = "/Users/Ely/Desktop/Ely_selection/R7/lead_clusters/"
        if not exists(otufolder):
            mkdir(otufolder)
        otufolder += otu
        if not exists(otufolder):
            mkdir(otufolder)
        # print out alignment and structure in fasta and stockholm formats
        alnout = open(otufolder + "/locarnap-aln.fasta", "w")
        alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
        alnout.close()
        alnout = open(otufolder + "/locarnap-aln.sto", "w")
        struct_dict = {"SS_cons": struct}
        alnout.write(stockholm_from_alignment(aln, GC_annotation=struct_dict))
        alnout.close()
        print struct
        # CLUSTER THE SECONDA

        print "Creating CM and running Infernal over all rounds"
        # create the cm file. Could call cmsearch_from_alignment but dont want to build
        # cm file multiple times since is time consuming and processor intensive
        cmfile = cmbuild_from_alignment(aln, struct, calibrate=True)
        for i in range(7, 0, -1):
            # run cmsearch over every round of SELEX
            # Only search unique sequences to save time
            seqs = LoadSeqs(
                "/Users/Ely/Desktop/Ely_selection/R" + str(i) + "/R" + str(i) + "-Unique.fasta",
                moltype=RNA,
                aligned=False,