コード例 #1
0
 def decode_targets(self):
     f = open(self.filename)
     # make sure to recognize chromosome number
     data = f.readline()[:-1]
     while data != "REPEATS":
         data = f.readline()[:-1]
         # parse location and sequence
         midpoint = data.find(',')
         location = data[:midpoint]
         sequence = data[midpoint + 1:]
         # decompress the location and sequence information
         s = SeqTranslate()
         location = s.decompress64(location, toseq=False)
         sequence = s.decompress64(sequence, toseq=True)
         # add location to storage vector
         self.targets.append((location, sequence))
コード例 #2
0
class CasperQuick:
    def __init__(self, casper_seq_file, output_file_path, ofa):
        self.csffile = casper_seq_file
        self.ST = SeqTranslate()
        self.allTargets = {}
        self.location = tuple()
        self.output = output_file_path
        self.off_target_all = ofa

    def loadGenesandTargets(self, rk):
        region_keggs = rk
        for region_kegg in region_keggs:
            self.allTargets[str(region_kegg)] = list()
            if type(region_kegg) == tuple:
                self.location = region_kegg
            else:
                k = Kegg()
                self.location = k.gene_locator(region_kegg)
            myfy = open(self.csffile)
            while True:
                line = myfy.readline()
                if line == '':
                    break
                if line.find('CHROMOSOME') != -1:
                    s = line.find("#")
                    if line[s + 1:-1] == str(
                            self.location[0]
                    ):  # checks to see if it is on the right chromosome
                        curpos = int()
                        while curpos < int(self.location[1]):
                            line = myfy.readline()
                            curpos = self.ST.decompress64(line.split(',')[0])
                        while curpos < int(self.location[2]):
                            line = self.ST.decompress_csf_tuple(
                                myfy.readline()[:-1])
                            curpos = line[0]
                            self.allTargets[str(region_kegg)].append(line)
                        break
            myfy.close()

        self.printoutresultstofile()

    def printoutresultstofile(self):
        out = self.output + "quickresults.txt"
        f = open(out, 'w')
        for item in self.allTargets.keys():
            f.write(item)
            f.write('\n')
            for target in self.allTargets[item]:
                insert = str(target[0]) + "," + str(target[1]) + "," + str(
                    target[2]) + '\n'
                f.write(insert)
        f.close()
コード例 #3
0
ファイル: multitargeting.py プロジェクト: Ktailor34/CASPERapp
 def get_instances(self):
     ST = SeqTranslate()
     os.chdir(path)
     f = open(self.file_name, 'r')
     while True:
         x = f.readline()
         if x == 'REPEATS\n':
             print("reached repeat sequences")
             break
     while True:
         t = f.readline()
         if t == 'END_OF_FILE':
             print("reached end of repeat sequences")
             break
         ukey = t[:-1]  # takes away the "\n" in the string
         key = ST.decompress64(ukey, slength=20, toseq=True)
         key = ST.fill_As(key, 16)
         self.BAD_instances[key] = list()
         # Add sequences and locations to the list
         v = f.readline().split('\t')[:-1]
         for item in v:
             loctup = item.split(',')
             chrom = loctup[0]
             location = ST.decompress64(loctup[1])
             seq = ST.decompress64(loctup[2][1:], slength=20, toseq=True)
             seq = ST.fill_As(
                 seq, 4
             )  # when A's get lost in the compression this fills them back in
             mytup = (chrom, location, seq)
             self.BAD_instances[key].append(mytup)
     f.close()
     print("currently sorting")
     for key in self.BAD_instances:
         size = len(self.BAD_instances[key])
         newtuple = (key, self.BAD_instances[key], size
                     )  # sequence, location, size
         self.sorted_instances.append(newtuple)
コード例 #4
0
ファイル: multitargeting.py プロジェクト: Ktailor34/CASPERapp
class Multitargeting(QtWidgets.QMainWindow):

    BAD_instances = {}
    sorted_instances = []

    def __init__(self, parent=None):

        super(Multitargeting, self).__init__()
        uic.loadUi('multitargetingwindow.ui', self)
        self.setWindowIcon(QtGui.QIcon("cas9image.png"))
        # Storage containers for the repeats and seed sequences
        self.sq = SeqTranslate()  # SeqTranslate object used in class

        # Initializes the three graphs
        self.chart_view_chro_bar = QChartView()
        self.chart_view_repeat_bar = QChartView()
        self.chart_view_repeat_line = QChartView()

        self.data = ""
        self.shortHand = ""
        self.chromo_length = list()

        # Listeners for changing the seed sequence or the .cspr file
        self.max_chromo.currentIndexChanged.connect(self.fill_seed_id_chrom)
        self.min_chromo.currentIndexChanged.connect(self.fill_seed_id_chrom)
        self.chromo_seed.currentIndexChanged.connect(self.chro_bar_data)
        self.Analyze_Button.clicked.connect(self.make_graphs)

        #go back to main button
        self.back_button.clicked.connect(self.go_back)

        #Tool Bar options
        self.actionCASPER.triggered.connect(self.changeto_main)

        # Statistics storage variables
        self.max_repeats = 1
        self.average = 0
        self.median = 0
        self.mode = 0
        self.average_unique = 0
        self.average_rep = 0
        self.bar_coords = []
        self.seed_id_seq_pair = {}
        self.positions = []

        #parser object
        self.parser = CSPRparser("")

        self.ready_chromo_min_max = True
        self.ready_chromo_make_graph = True
        self.directory = 'Cspr files'
        self.info_path = os.getcwd()

        ##################################
        self.scene = QtWidgets.QGraphicsScene()
        self.graphicsView.setScene(self.scene)
        self.scene2 = QtWidgets.QGraphicsScene()
        self.graphicsView_2.setScene(self.scene2)
        self.graphicsView.viewport().installEventFilter(self)

    def eventFilter(self, source, event):
        if (event.type() == QtCore.QEvent.MouseMove
                and source is self.graphicsView.viewport()):
            coord = self.graphicsView.mapToScene(event.pos())
            first = True
            for i in self.bar_coords:
                ind = i[0]
                x = i[1]
                y1 = i[2]
                y2 = i[3]
                dups = 0
                if ((coord.x() == x or coord.x() == x + 1
                     or coord.x() == x - 1)
                        and (coord.y() >= y1 and coord.y() <= y2)):

                    listtemp = []
                    for a in self.bar_coords:
                        if (x == a[1] and y1 == a[2] and y2 == a[3]):
                            listtemp.append(a)
                            dups += 1
                    self.scene2 = QtWidgets.QGraphicsScene()
                    self.graphicsView_2.setScene(self.scene2)
                    #self.graphicsView_2.hide()
                    output = str()
                    i = 1
                    for item in listtemp:

                        ind = item[0]
                        seq = str(self.seq_data[ind])
                        seed_id = self.seed_id_seq_pair[seq]
                        temp = self.parser.dec_tup_data[seed_id]
                        temp = temp[ind]
                        if len(listtemp) > 1 and i < len(listtemp):
                            output += 'Location: ' + str(
                                temp[0]) + ' | Seq: ' + str(
                                    temp[1]) + ' | PAM: ' + str(
                                        temp[2]) + ' | SCR: ' + str(
                                            temp[3]) + ' | DIRA: ' + str(
                                                temp[4]) + '\n'
                        else:
                            output += 'Location: ' + str(
                                temp[0]) + ' | Seq: ' + str(
                                    temp[1]) + ' | PAM: ' + str(
                                        temp[2]) + ' | SCR: ' + str(
                                            temp[3]) + ' | DIRA: ' + str(
                                                temp[4])
                        i += 1
                    text = self.scene2.addText(output)
                    #self.graphicsView_2.adjustSize()
                    font = QtGui.QFont()
                    font.setBold(True)
                    font.setPointSize(9)
                    text.setFont(font)

        return Qt.QWidget.eventFilter(self, source, event)

    def launch(self, path):
        os.chdir(path)
        self.directory = path
        self.get_data()
        self.make_graphs()

    def get_data(self):
        onlyfiles = [
            f for f in os.listdir(self.directory)
            if os.path.isfile(os.path.join(self.directory, f))
        ]
        print(onlyfiles)
        orgsandendos = {}
        shortName = {}
        self.endo_drop.clear()
        for file in onlyfiles:
            if file.find('.cspr') != -1:
                newname = file[0:-4]
                s = newname.split('_')
                hold = open(file)
                buf = (hold.readline())
                species = buf[8:buf.find('\n')]
                endo = str(s[1])
                if species not in shortName:
                    shortName[species] = s[0]
                if species in orgsandendos:
                    orgsandendos[species].append(endo)
                else:
                    orgsandendos[species] = [endo]
                    if self.organism_drop.findText(species) == -1:
                        self.organism_drop.addItem(species)
        self.data = orgsandendos
        self.shortHand = shortName
        temp = self.data[str(self.organism_drop.currentText())]
        temp1 = []
        for i in temp:
            i = i.strip('.')
            temp1.append(i)
        self.endo_drop.addItems(temp1)
        self.organism_drop.currentIndexChanged.connect(self.changeEndos)

    def changeEndos(self):
        self.endo_drop.clear()
        temp = self.data[str(self.organism_drop.currentText())]
        temp1 = []
        for i in temp:
            i = i.strip('.')
            temp1.append(i)
            print(i)
        print(temp1)
        self.endo_drop.addItems(temp1)

    def make_graphs(self):
        #get the correct file name
        self.chromo_length.clear()
        file_name = self.shortHand[self.organism_drop.currentText(
        )] + "_" + self.endo_drop.currentText()
        if self.directory.find("/") != -1:
            file = (self.directory + "/" + file_name + ".cspr")
        else:
            file = (self.directory + "\\" + file_name + ".cspr")

        #set up parser, and get the repeats and carry stats
        self.parser.fileName = file
        print(self.endo_drop.currentText())
        self.parser.read_repeats(self.endo_drop.currentText())
        self.parser.read_chromesome(self.endo_drop.currentText())
        self.parser.read_first_lines()
        self.chromo_length = self.parser.karystatsList

        #calculations and setting the windows
        self.average_rep = self.parser.multiSum / self.parser.multiCount
        self.plot_repeats_vs_seeds()
        self.bar_seeds_vs_repeats()
        self.fill_min_max()
        #self.chro_bar_data()
        self.nbr_seq.setText(str(len(self.parser.seeds)))
        self.nbr_unq.setText(str(self.parser.uniq_seq_count()))
        self.avg_rep.setText(str(self.average))
        self.med_rep.setText(str(self.median))
        self.mode_rep.setText(str(self.mode))
        self.scr_lbl.setText(str(self.average_rep))

    #fill in chromo bar visualization
    def chro_bar_data(self):

        if self.ready_chromo_make_graph == False:
            return
        dic_info = {}
        seqLength = int(self.sq.endo_info[self.endo_drop.currentText()][1])
        for seed in self.parser.seeds:
            temp = seed
            temp1 = str(
                self.sq.decompress64(temp, slength=seqLength, toseq=True))
            self.seed_id_seq_pair[temp1] = seed
            dic_info[temp1] = {}
            for repeat in self.parser.seeds[seed]:
                if repeat[0] in dic_info[temp1]:
                    dic_info[temp1][repeat[0]].append(
                        self.sq.decompress64(repeat[1]))
                else:
                    dic_info[temp1][repeat[0]] = [
                        self.sq.decompress64(repeat[1])
                    ]
        self.chro_bar_create(dic_info)
        self.fill_Chromo_Text(dic_info)

    #fill in chromo bar visualization
    def fill_Chromo_Text(self, info):
        chromo_pos = {}
        self.seq_data = []
        self.positions.clear()
        chomonum = 0
        for chromo in info[self.chromo_seed.currentText()]:
            pos = []
            for position in info[(self.chromo_seed.currentText())][chromo]:
                self.seq_data.append(self.chromo_seed.currentText())
                test1 = position / self.chromo_length[int(chromo) - 1]
                test1 = int(test1 * 485)
                self.positions.append(test1)
                pos.append(test1)

            chromo_pos[chromo] = pos
            chomonum += 1

        i = 0
        self.scene = QtWidgets.QGraphicsScene()
        self.graphicsView.setScene(self.scene)
        self.bar_coords.clear()  #clear bar_coords list before creating visual
        ind = 0
        for chromo in chromo_pos:
            pen_blk = QtGui.QPen(QtCore.Qt.black)
            pen_red = QtGui.QPen(QtCore.Qt.red)
            pen_blk.setWidth(3)
            pen_red.setWidth(3)
            if i == 0:
                text = self.scene.addText(str(chromo))
                text.setPos(0, 0)
                font = QtGui.QFont()
                font.setBold(True)
                font.setPointSize(10)
                text.setFont(font)
                self.scene.addRect(40, (i * 25), 525, 25, pen_blk)

            else:
                text = self.scene.addText(str(chromo))
                font = QtGui.QFont()
                font.setBold(True)
                font.setPointSize(10)
                text.setFont(font)
                text.setPos(0, i * 25 + 10 * i)

                self.scene.addRect(40, (i * 25) + 10 * i, 525, 25, pen_blk)
            for k in chromo_pos[chromo]:
                line = self.scene.addLine(k + 40, (i * 25) + 3 + 10 * i,
                                          k + 40, (i * 25) + 22 + 10 * i,
                                          pen_red)
                temp = [
                ]  #used for storing coordinates and saving them in self.bar_coords[]
                temp.append(ind)  #index value
                temp.append(k + 40)  #x value
                temp.append((i * 25) + 3 + 10 * i)  #y1
                temp.append((i * 25) + 22 + 10 * i)  #y2
                self.bar_coords.append(temp)  #push x, y1, and y2 to this list
                ind += 1
            i = i + 1

    #creates bar graph num of repeats vs. chromsome
    #this graphs is connected to the repeats_vs_chromo.py file
    #to represent the widget space in the UI file
    def chro_bar_create(self, info):
        x1 = []
        y1 = []
        lentemp = 0
        for chromo in info[self.chromo_seed.currentText()]:
            y1.append(len(info[self.chromo_seed.currentText()][chromo]))
            x1.append(chromo)
            if (int(chromo) > lentemp):
                lentemp = int(chromo)
        #clear the old graph
        self.repeats_vs_chromo.canvas.axes.clear()
        #x_pos used to format the addition of more bars appropriately
        x_pos = [i for i, _ in enumerate(x1)]

        #loop fixes when there is too many xlabels and they start running together,
        #replaces some with an empty string to space out the labels
        if (len(x_pos) > 20):
            temp = 0
            for i in x_pos:
                if (i == 0):
                    temp += 1
                else:
                    if (temp < len(str(lentemp)) + 2):
                        x1[i] = ""
                        temp += 1
                    else:
                        temp = 0

        #the following statements are plottings / formatting for the graph
        self.repeats_vs_chromo.canvas.axes.bar(x_pos, y1, align='center')
        self.repeats_vs_chromo.canvas.axes.yaxis.set_major_locator(
            MaxNLocator(integer=True))
        self.repeats_vs_chromo.canvas.axes.set_ylim(0, max(y1) + 1)
        self.repeats_vs_chromo.canvas.axes.set_xticks(x_pos)
        self.repeats_vs_chromo.canvas.axes.set_xticklabels(x1)
        self.repeats_vs_chromo.canvas.axes.set_xlabel('Chromosome')
        self.repeats_vs_chromo.canvas.axes.set_ylabel('Number of Repeats')

        #for loop below could be used to rotae labels for spacing
        #for tick in self.repeats_vs_chromo.canvas.axes.get_xticklabels():
        #   tick.set_rotation(90)

        self.repeats_vs_chromo.canvas.draw()

    #plots the sequences per Number Repeats bar graph
    #this graph is connected to the seeds_vs_repeats_bar.py file
    #to represent the wdiget space in the UI file
    def bar_seeds_vs_repeats(self):
        data = {}
        self.average = 0
        for seed in self.parser.repeats:
            self.average += int(self.parser.repeats[seed])
            number = self.parser.repeats[seed]
            if number in data:
                data[number] += 1
            else:
                data[number] = 1
        data = self.order_high_low_rep(data)
        self.average = round(self.average / (len(self.parser.repeats)))
        holder = []
        repeats = []
        max = 0
        for number in data:
            if data[number] > max:
                max = data[number]
            if (data[number] / max) > .01:
                holder.append(data[number])
                repeats.append(number)
        #clear graph space
        self.seeds_vs_repeats_bar.canvas.axes.clear()
        #xpos used to handle appropriate formatting for more bars being added in
        x_pos = [i for i, _ in enumerate(repeats)]
        #the following are plotting / formatting for the graph
        self.seeds_vs_repeats_bar.canvas.axes.bar(x_pos, holder)
        self.seeds_vs_repeats_bar.canvas.axes.set_xticks(x_pos)
        self.seeds_vs_repeats_bar.canvas.axes.set_xticklabels(repeats)
        self.seeds_vs_repeats_bar.canvas.axes.set_xlabel('Number of Repeats')
        self.seeds_vs_repeats_bar.canvas.axes.set_ylabel('Number of Sequences')
        self.seeds_vs_repeats_bar.canvas.axes.set_title(
            'Number of Sequences per Number of Repeats')
        #rects are all the bar objects in the graph
        rects = self.seeds_vs_repeats_bar.canvas.axes.patches
        rect_vals = []
        #this for loop will calculate the height and create an annotation for each bar
        for rect in rects:
            height = rect.get_height()
            temp = self.seeds_vs_repeats_bar.canvas.axes.text(
                rect.get_x() + rect.get_width() / 2,
                height,
                '%d' % int(height),
                ha='center',
                va='bottom')
            temp.set_visible(False)
            rect_vals.append(temp)
        #function used for when user cursor is hovering over the bar, if hovering over a bar, the
        #height annotatin will appear above the bar, otherwise it will be hidden
        def on_plot_hover(event):
            i = 0
            for rect in rects:
                height = rect.get_height()
                if rect.contains(event)[0]:
                    rect_vals[i].set_visible(True)
                else:
                    rect_vals[i].set_visible(False)

                i = i + 1

            self.seeds_vs_repeats_bar.canvas.draw()

        #statement to detect cursor hovering over the bars
        self.seeds_vs_repeats_bar.canvas.mpl_connect('motion_notify_event',
                                                     on_plot_hover)
        #must redraw after every change
        self.seeds_vs_repeats_bar.canvas.draw()

    #plots the repeats per ID number graph as line graph
    #this graph is connected to the repeats_vs_seeds_line.py file
    #to represent the widget space in the UI file
    def plot_repeats_vs_seeds(self):
        data = {}
        for seed in self.parser.repeats:
            number = self.parser.repeats[seed]
            if number in data:
                data[number] += 1
            else:
                data[number] = 1

        max = 0
        y1 = []
        x1 = []
        index = 0
        time = 0
        for number in self.order(data):
            time += 1

            if int(data[number]) > max:
                max = int(data[number])
                self.mode = number

            hold = 0
            while hold < data[number]:
                if index == int(round(len(self.parser.repeats) / 2)):
                    self.median = number
                x1.append(index)
                y1.append(number)
                index = index + 1
                hold += 1

        #clear axes
        self.repeats_vs_seeds_line.canvas.axes.clear()
        #the following are for plotting / formatting
        self.repeats_vs_seeds_line.canvas.axes.plot(x1, y1)
        self.repeats_vs_seeds_line.canvas.axes.set_xlabel('Seed ID Number')
        self.repeats_vs_seeds_line.canvas.axes.set_ylabel('Number of Repeats')
        self.repeats_vs_seeds_line.canvas.axes.set_title(
            'Number of Repeats per Seed ID Number')
        #always redraw at the end
        self.repeats_vs_seeds_line.canvas.draw()

    #fills min and max dropdown windows
    def fill_min_max(self, run_seed_fill=True):
        self.ready_chromo_min_max = False
        index = 1
        self.max_chromo.clear()
        self.min_chromo.clear()
        while index < self.max_repeats + 1:
            self.min_chromo.addItem(str(index))
            self.max_chromo.addItem(str(self.max_repeats + 1 - index))
            index += 1
        self.ready_chromo_min_max = True
        if run_seed_fill:
            self.fill_seed_id_chrom()

    #fill_seed_id_chrom will fill the seed ID dropdown, and create the chromosome graph
    def fill_seed_id_chrom(self):
        if self.ready_chromo_min_max == False:
            return
        if int(self.min_chromo.currentText()) > int(
                self.max_chromo.currentText()):
            self.ready_chromo_min_max = False
            self.max_chromo.clear()
            self.min_chromo.clear()
            self.ready_chromo_min_max = True
            self.fill_min_max(False)
            QtWidgets.QMessageBox.question(
                self, "Maximum cant be less than Minimum",
                "The Minimum number of repeats cant be more than the Maximum",
                QtWidgets.QMessageBox.Ok)
            self.fill_seed_id_chrom()
            return
        self.ready_chromo_make_graph = False
        self.chromo_seed.clear()
        any = False
        seqLength = int(self.sq.endo_info[self.endo_drop.currentText()][1])
        for seed in self.parser.repeats:
            if self.parser.repeats[seed] >= int(self.min_chromo.currentText(
            )) and self.parser.repeats[seed] <= int(
                    self.max_chromo.currentText()):
                any = True
                #temp = self.sq.compress(seed,64)
                self.chromo_seed.addItem(
                    str(
                        self.sq.decompress64(seed,
                                             slength=seqLength,
                                             toseq=True)))
        if any == False:
            QtWidgets.QMessageBox.question(
                self, "No matches found",
                "No seed that is within the specifications could be found",
                QtWidgets.QMessageBox.Ok)
            self.ready_chromo_min_max = False
            self.max_chromo.clear()
            self.min_chromo.clear()
            self.ready_chromo_min_max = True
            self.fill_min_max(False)
            self.fill_seed_id_chrom()
            return
        self.ready_chromo_make_graph = True
        self.chro_bar_data()

    def order(self, data_par):
        data = dict(data_par)
        data2 = []
        while len(data) > 0:
            max = 0
            for item in data:
                if item > max:
                    max = item
            data2.append(max)
            if len(data2) == 1:
                self.max_repeats = max
            del data[max]
        return data2

    def order_high_low_rep(self, dictionary):
        data = dict(dictionary)
        data_ordered = {}
        while len(data) > 0:
            max = 0
            max_index = 0
            for item in data:

                if data[item] > max:
                    max_index = item
                    max = data[item]

            data_ordered[max_index] = max

            del data[max_index]
        return data_ordered

    #connects to view->CASPER to switch back to the main CASPER window
    def changeto_main(self):
        GlobalSettings.mainWindow.show()
        self.hide()

    #connects to go back button in bottom left to switch back to the main CASPER window
    def go_back(self):
        GlobalSettings.mainWindow.show()
        self.hide()

    #-----------------------NOT USED----------------------------#
    def get_instances(self):
        ST = SeqTranslate()
        os.chdir(path)
        f = open(self.file_name, 'r')
        while True:
            x = f.readline()
            if x == 'REPEATS\n':
                print("reached repeat sequences")
                break
        while True:
            t = f.readline()
            if t == 'END_OF_FILE':
                print("reached end of repeat sequences")
                break
            ukey = t[:-1]  # takes away the "\n" in the string
            key = ST.decompress64(ukey, slength=20, toseq=True)
            key = ST.fill_As(key, 16)
            self.BAD_instances[key] = list()
            # Add sequences and locations to the list
            v = f.readline().split('\t')[:-1]
            for item in v:
                loctup = item.split(',')
                chrom = loctup[0]
                location = ST.decompress64(loctup[1])
                seq = ST.decompress64(loctup[2][1:], slength=20, toseq=True)
                seq = ST.fill_As(
                    seq, 4
                )  # when A's get lost in the compression this fills them back in
                mytup = (chrom, location, seq)
                self.BAD_instances[key].append(mytup)
        f.close()
        print("currently sorting")
        for key in self.BAD_instances:
            size = len(self.BAD_instances[key])
            newtuple = (key, self.BAD_instances[key], size
                        )  # sequence, location, size
            self.sorted_instances.append(newtuple)

    #not used
    # Returns the container self.sorted_instances but removes all "single" repeats. Old Code to fix an off-by-1 error
    def return_all_seqs(self):
        myseqs = []
        for instance in self.sorted_instances:
            if instance[2] > 1:
                myseqs.append(instance)
        return myseqs

    #not used
    def return_sorted(self):
        sorted_seqs = sorted(self.sorted_instances,
                             key=operator.itemgetter(2),
                             reverse=True)
        amounts = {}
        for instance in sorted_seqs:
            if instance[2] > 1:
                if instance[2] in amounts:
                    amounts[instance[2]] += 1
                else:
                    amounts[instance[2]] = 1
                print(
                    str(instance[0]) + "," + str(instance[2]) + "," +
                    str(instance[1]))
        for element in amounts:
            print("Number of seed sequences with " + str(element) +
                  " appearances: " + str(amounts[element]))

    #not used
    def return_positions(self):
        positions_mapped = [
        ]  # chromosme, beginning of range, end of range, and number of hits
        for instance in self.sorted_instances:
            if instance[2] > 1:
                for pos in instance[1]:
                    chrom = pos[0]
                    loc = int(pos[1])
                    # check to see if its already in the map
                    need_new = True
                    for position in positions_mapped:
                        if chrom == position[0]:
                            if position[1] < loc < position[2]:
                                position[3] += 1
                                position[4].append(instance[0])
                                need_new = False
                                print("position added")
                    if need_new:
                        newtuple = [
                            chrom, loc - 1000, loc + 1000, 1,
                            [" ", instance[0]]
                        ]
                        positions_mapped.append(newtuple)
        sorted_positions = sorted(positions_mapped,
                                  key=operator.itemgetter(3),
                                  reverse=True)
        for element in sorted_positions:
            print(
                str(element[0]) + "," + str(element[1]) + "," +
                str(element[2]) + "," + str(element[3]))
        for element in sorted_positions:
            sequences = ""
            for sequence in element[4]:
                sequences += sequence + ","
            print(sequences)
        return sorted_positions

    #not used
    def int_to_char(self, i):
        switcher = {0: 'A', 1: 'T', 2: 'C', 3: 'G'}
        return switcher[i]

    # ----------------------------------------------------------#

    # this function calls the closingWindow class.
    def closeEvent(self, event):
        GlobalSettings.mainWindow.closeFunction()
        event.accept()
コード例 #5
0
class CSPRparser:
    #default ctor: currently just sets the file name and initializes all of the variables I will be using
    def __init__(self, inputFileName):

        # variables used in this class
        self.multiSum = 0  #multitargetting sum taken from the previous version of make_graphs
        self.multiCount = 0  #multitargetting count taken from the previous version of make_graphs
        self.seqTrans = SeqTranslate(
        )  #SeqTranslate variable. for decrompressing the data
        self.chromesomeList = list(
        )  # list of a list for the chromesomes. As it currently stands, this variable is used in both read_chromesomes and in read_targets
        self.karystatsList = list(
        )  # list of (ints) of the karyStats (whatever those are) to be used for the get_chrom_length function
        self.genome = ""  # genome name
        self.misc = ""  # anything from the misc line
        self.repeats = {
        }  #dictionary of the number of repeats. See the read_repeats function for more info
        self.seeds = {
        }  #dictionary of which chromesomes are repeats. See the read_repeats function for more info
        self.dec_tup_data = {}
        self.chromesomesSelectedList = list()
        # data for population analysis
        # dict:
        # key = the seed
        #       value = tuple (org name, chom #, location, sequence, pam, score, strand, endo)
        self.popData = {}

        #file path variable
        self.fileName = inputFileName

    # this is the parser that is used for the gen_lib window
    # it returns a list of lists, essentially all of the chromosomes in the file, and their data
    # to make it faster, this now uses read_targets
    def gen_lib_parser(self, genDict, endo):
        retDict = dict()

        #for item in genDict:
        #   retList.append((list()))

        for gene in genDict:
            retDict[gene] = list()
            retDict[gene] = self.read_targets(
                '', (genDict[gene][0], genDict[gene][1], genDict[gene][2]),
                endo)
        return retDict

    #this function reads the first 3 lines of the file: also stores the karyStats in a list of ints
    def read_first_lines(self):
        fileStream = open(self.fileName, 'r')

        #read and parse the genome line
        self.genome = fileStream.readline()
        colonIndex = self.genome.find(':') + 2
        buffer1 = self.genome[colonIndex:]
        self.genome = buffer1

        #read and store the karystats line on its own, it is parsed down below
        buffer = fileStream.readline()

        #read and parse the misc line
        self.misc = fileStream.readline()
        colonIndex = self.misc.find(':') + 2
        buffer1 = self.misc[colonIndex:]
        self.misc = buffer1

        #now parse the karystats line
        #ignore the first bit of the string. only care about what's after the colon
        colonIndex = buffer.find(':') + 2

        #parse the line, store the numbers in the list
        for i in range(colonIndex, len(buffer)):
            bufferString1 = ""
            if buffer[i] == ',':
                bufferString1 = buffer[colonIndex:i]
                #print(bufferString1)
                colonIndex = i + 1
                self.karystatsList.append(int(bufferString1))

        fileStream.close()
        #print(self.karystatsList)

    # this function gets the chromesome names out of the CSPR file provided
    # returns the gene line, and the misc line as well
    # also stores the Karystats
    def get_chromesome_names(self):
        self.read_first_lines()
        self.chromesomesSelectedList.clear()

        fileStream = open(self.fileName, 'r')

        retGen = fileStream.readline()
        junk = fileStream.readline()
        retMisc = fileStream.readline()

        buffer = fileStream.readline()

        while True:  # breaks out when the buffer line = REPEATS
            if buffer == 'REPEATS\n':
                break
            elif '>' in buffer:
                self.chromesomesSelectedList.append(buffer)
            buffer = fileStream.readline()

        return retGen, retMisc

#this function reads all of the chromosomes in the file
#stores the data into a list of lists. So the line starting with '>' is the first index of each sub list

    def read_chromesome(self, endo):
        self.chromesomeList.clear()
        tempList = list()
        fileStream = open(self.fileName, 'r')

        #ignore the first 3 lines
        fileStream.readline()
        fileStream.readline()
        fileStream.readline()

        bufferString = fileStream.readline()
        while (True):  #this loop breaks out when bufferString is REPEATS
            tempList.append(bufferString)

            if (bufferString == "REPEATS\n"):
                break
            bufferString = fileStream.readline()
            while (True):  #this loop breaks out when bufferString[0] is >
                if (bufferString == "REPEATS\n"):
                    self.chromesomeList.append(tempList)
                    tempList = []
                    break

                elif (
                        bufferString[0] == '>'
                ):  #if we get to the next chromesome, append the tempList, clear it, and break
                    self.chromesomeList.append(tempList)
                    tempList = []
                    break
                else:  #else decompress the data, and append it to the list
                    bufferString = self.seqTrans.decompress_csf_tuple(
                        bufferString, endo=endo)
                    tempList.append(bufferString)
                    #print(bufferString)
                    bufferString = fileStream.readline()
        fileStream.close()

########################################################################################################
#    this function reads just the repeats
#    it stores this data in 2 dictionaries:
#        repeats dictionary is the number of dictionaries
#               key = the seed, and the value is the number of repeats
#        seeds dictionary is each seed that is repeated
#           key =  the seeds, and the value is the actual chromesome that is repeated
#    this function also stores the sum and count in the class itself as well
#    this function is very similar to what make_graphs in Multitargeting.py was doing before
########################################################################################################

    def read_repeats(self, endoChoice):
        index = 0

        seedLength = int(self.seqTrans.endo_info[endoChoice][1])

        #clear what is already in there
        self.repeats.clear()
        self.seeds.clear()

        # only read the repeats section of the file
        fileStream = open(self.fileName, 'r')
        buf = fileStream.readline()
        while buf != "REPEATS\n":
            buf = fileStream.readline()
        split_info = fileStream.read().split('\n')
        fileStream.close()

        #parse the info now and store it in the correct dictionaries
        while (index + 1 < len(split_info)):
            seed = self.seqTrans.decompress64(split_info[index],
                                              slength=seedLength)
            repeat = split_info[index + 1].split("\t")

            self.repeats[seed] = 0
            self.seeds[seed] = []
            self.dec_tup_data[seed] = []
            for item in repeat:
                #print(self.seqTrans.decompress_csf_tuple(item, endo=endoChoice, bool=True))
                if item != "":
                    self.repeats[seed] += 1
                    sequence = item.split(',')
                    self.seeds[seed].append(sequence)
                    temp = sequence[1:4]

                    #print(seed)
                    #print(str(self.seqTrans.compress(seed,64)))
                    #print(temp[1])

                    #temp[1] = str(self.seqTrans.compress(seed,64)) + str(temp[1])
                    #print(temp)

                    temp.append(
                        str(
                            self.seqTrans.decompress64(
                                seed, toseq=True, slength=int(seedLength))))
                    #print(temp)
                    string = ",".join(temp)
                    #print(string)
                    #print('\t', self.seqTrans.decompress_csf_tuple(string, bool=True, endo=endoChoice))
                    self.dec_tup_data[seed].append(
                        self.seqTrans.decompress_csf_tuple(string,
                                                           bool=True,
                                                           endo=endoChoice))
                    self.multiSum += self.seqTrans.decompress64(
                        sequence[3], slength=seedLength)
                    self.multiCount += 1

            index = index + 2

    # this function takes a list of all the file names
    # it finds the repeats for each file, and also checks to see if those repeats are in each file, not just the first
    # stores the data in a class object
    def popParser(self, cspr_file, endoChoice):
        self.popData.clear()
        seedLength = self.seqTrans.endo_info[endoChoice][1]

        referenceList = list()

        # skip the junk
        file_stream = open(cspr_file, 'r')
        genomeLine = file_stream.readline()
        file_stream.readline()

        # parse the genome line
        genomeLine = genomeLine.split(',')
        retNumber = int(genomeLine[len(genomeLine) - 1])

        # parse the miscalleneous line and get the data we want out of it
        misc_line = file_stream.readline()
        colonIndex = misc_line.find(':') + 2
        usefulData = misc_line[colonIndex:]

        usefulData = usefulData.split('|')
        usefulData.pop()

        i = 0
        while i < len(usefulData):
            temp = usefulData[i].split(',')
            referenceList.append((temp[0], temp[1]))
            i += 1

        buf = file_stream.readline()
        while buf != 'REPEATS\n':
            buf = file_stream.readline()

        split_info = file_stream.read().split('\n')
        file_stream.close()

        index = 0
        while (index + 1 < len(split_info)):
            # get the seed and repeat line
            seed_d = self.seqTrans.decompress64(split_info[index],
                                                slength=int(seedLength),
                                                toseq=True)
            repeat = split_info[index + 1].split('\t')

            # if the seed is not in the dict, put it in there
            if seed_d not in self.popData:
                self.popData[seed_d] = list()

            for item in repeat:
                if item != '':
                    commaIndex = item.find(',')
                    chrom = item[:commaIndex]
                    sequence = item.split(',')
                    temp = sequence[1:4]
                    temp.append(str(seed_d))
                    string = ",".join(temp)
                    tempTuple = self.seqTrans.decompress_csf_tuple(
                        string, bool=True, endo=endoChoice)
                    orgName = referenceList[int(chrom) - 1][0]

                    storeTuple = (
                        orgName,
                        chrom,
                        tempTuple[0],
                        tempTuple[1],
                        tempTuple[2],
                        tempTuple[3],
                        tempTuple[4],
                        tempTuple[5],
                    )

                    self.popData[seed_d].append(storeTuple)

            index += 2

        return retNumber, referenceList
        """
        # for each file given
        for count in range(len(file_list)):

            # open the file and get the orgName
            fileStream = open(file_list[count], 'r')
            buf = fileStream.readline()
            colonIndex = buf.find(':')
            orgName = buf[colonIndex + 2:]
            orgName = orgName.replace('\n', '')
            print(orgName)

            # now skip until the repeats section
            while buf != 'REPEATS\n':
                buf = fileStream.readline()

            # read the whole repeats section
            split_info = fileStream.read().split('\n')
            fileStream.close()

            index = 0
            seedLength = self.seqTrans.endo_info[endoChoice][1]
            while (index + 1 < len(split_info)):
                # get the seed and repeat line
                seed_d = self.seqTrans.decompress64(split_info[index], slength=int(seedLength), toseq=True)
                repeat = split_info[index + 1].split("\t")

                # if the seed is not in the dict, put it in there
                if seed_d not in self.popData:
                    self.popData[seed_d] = list()


                # go through and append each line
                for item in repeat:
                    if item != "":
                        # get the chromosome number
                        commaIndex = item.find(',')
                        chrom = item[:commaIndex]
                        # from read_repeats
                        sequence = item.split(',')
                        temp = sequence[1:4]
                        temp.append(str(seed_d))
                        string = ",".join(temp)
                        tempTuple = self.seqTrans.decompress_csf_tuple(string, bool=True, endo=endoChoice)

                        # store what we need
                        storeTuple = (orgName, chrom,  tempTuple[0], tempTuple[1], tempTuple[2], tempTuple[3], tempTuple[4], tempTuple[5],)
                        #storeTuple = (orgName, chrom, temp)

                        # append it
                        self.popData[seed_d].append(storeTuple)
                index += 2
            split_info.clear()
        """

    #this function just reads the whole file
    def read_all(self):
        print("Reading First Lines.")
        self.read_first_lines()
        print("Reading Chromesomes.")
        self.read_chromesome()
        print("Reading Repeats.")
        self.read_repeats()

    #this functions reads the entirety of the file into one string
    def get_whole_file(self):
        fileStream = open(self.fileName)
        fileData = fileStream.read()
        fileStream.close()
        return (fileData)

    #this function reads all of the targets in the file. It is essentially a copy of get_targets from the results.py file, written by Brian Mendoza
    def read_targets(self, genename, pos_tuple, endo):
        #open the file, and store the genome and the misc tags.
        #Note: The KARYSTATS is not stored at all. This should not be hard to implement if it is needed
        fileStream = open(self.fileName)
        self.genome = fileStream.readline()
        fileStream.readline()
        retList = list()
        self.misc = fileStream.readline()

        header = fileStream.readline()

        # get the sequence length for the decompressor
        seqLength = self.seqTrans.endo_info[endo][2]
        # Find the right chromosome:
        while True:
            # quick error check so the loop eventually breaks out if nothing is found
            if header == "":
                print("Error: the target could not be found in this file!")
                break
            # in the right chromosome/scaffold?
            if header.find("(" + str(pos_tuple[0]) + ")") != -1:
                while True:
                    # Find the appropriate location by quickly decompressing the location at the front of the line
                    myline = fileStream.readline()
                    if self.seqTrans.decompress64(
                            myline.split(",")[0],
                            slength=seqLength) >= pos_tuple[1]:
                        while self.seqTrans.decompress64(
                                myline.split(",")[0],
                                slength=seqLength) < pos_tuple[2]:
                            retList.append(
                                self.seqTrans.decompress_csf_tuple(myline,
                                                                   endo=endo))
                            myline = fileStream.readline()
                    else:
                        continue
                    break
                break
            else:
                header = fileStream.readline()
        fileStream.close()

        return retList

    def uniq_seq_count(self):
        self.unique_targets = 0
        for chromo in self.chromesomeList:
            for data in chromo:
                if len(data) == 6:
                    self.unique_targets += 1
        return self.unique_targets
コード例 #6
0
ファイル: Comparison.py プロジェクト: Teico88/CASPER
class Compare_Orgs:
    def __init__(self, output_path, base_org_path, base_org, endo,
                 other_genomes):
        # initialize SeqTranslate object
        self.ST = SeqTranslate()
        self.output_path = output_path
        # my_orgs contains just the
        self.organisms = other_genomes
        self.organisms.append(base_org)
        self.organisms = sorted(self.organisms)
        self.db_path = base_org_path[:base_org_path.find(base_org)]

        # Dictionary of dictionaries. Key1: generic total sequence Key2: org Value: position
        self.searchableseqs = {}

        # Container that stores all the sequences seen the combination of organisms defined by the key
        # An example key would be (sce, yli) for the shared sequences between S.cerevisiae and Y.lipolytica
        self.buckets = {}

        # Intitialize the self.buckets container to contain the tuple of every organism subset
        for i in range(2, len(self.organisms)):
            for subset in itertools.combinations(self.organisms, i):
                self.buckets[subset] = []
                print(subset)
        self.endo = endo

        # The object that is iterated over to decompress the output into readable form
        self.compressed_output = {}

        # Generates the sequence lists
        for org in self.organisms:
            print(org)
            self.make_lists(org)

        # Runs the comparison
        self.create_comparison()
        self.write_to_file()

    # Takes an organism and parses the target data into positions and repeated sequences containers
    def make_lists(self, org):
        name1 = self.db_path + org + self.endo + ".cspr"
        f = open(name1, 'r')
        curchrom = int()
        while True:
            position = f.readline()
            if position.find("CHROMOSOME") != -1:
                curchrom = position[position.find("#") + 1:-1]
                print(curchrom)
            else:
                if position[0:-1] == "REPEATS":
                    break
                # adds to the positions container the unique position with organism, and chromosome as keys
                line = position[:-1].split(",")
                # change line into generic (no "+" or "-" change to generic .)
                totseq = self.ST.to_generic_compressed(line[1])
                self.add_to_sequence_matrix(totseq, org, curchrom, line[0])

        while True:
            seedseq = f.readline()[:-1]
            if seedseq.find("END_OF_FIL") != -1:
                break
            taillocs = f.readline().split('\t')[:-1]
            for item in taillocs:
                loctup = item.split(',')
                totseq = self.ST.to_generic_compressed(
                    [seedseq, loctup[2][1:]])
                self.add_to_sequence_matrix(totseq, org, loctup[0], loctup[1])
        f.close()

    # Takes in the variables of a sequence including what organism it is found on and adds it to the dict of dicts
    # named:self.searchableseqs
    def add_to_sequence_matrix(self, totseq, org, chrom, location):
        if totseq in self.searchableseqs.keys():
            # already seen this organism and sequence
            if org in self.searchableseqs[totseq].keys():
                self.searchableseqs[totseq][org].append((chrom, location))
            # already seen this sequence but not this sequence in the organism
            else:
                self.searchableseqs[totseq][org] = []
                self.searchableseqs[totseq][org].append((chrom, location))
        # new organism and new sequence
        else:
            self.searchableseqs[totseq] = {}
            self.searchableseqs[totseq][org] = []
            self.searchableseqs[totseq][org].append((chrom, location))

    def int_to_char(self, i):
        switcher = {0: 'A', 1: 'T', 2: 'C', 3: 'G'}
        return switcher[i]

    def revcom(self, sequence):
        revseq = ""
        change = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
        for nt in sequence:
            if nt in change:
                rnt = change[nt]
            else:
                rnt = nt
            revseq = rnt + revseq
        return revseq

    def create_comparison(self):
        # Put every sequence in the appropriate bucket
        tempdict = dict()
        for sequence in self.searchableseqs:
            # Look for the set of organisms containing this sequence
            if len(self.searchableseqs[sequence].keys()) > 1:
                # Make sure the tuple is in the right order
                orgs = self.searchableseqs[sequence].keys()
                orgs = tuple(sorted(orgs))
                # iterate through all the sequences contained in each organism
                for org in self.searchableseqs[sequence].keys():
                    tempdict[org] = []
                    for location in self.searchableseqs[sequence][org]:
                        tempdict[org].append(location)
                insert_tup = (sequence, tempdict)
                tempdict = {}
                # contains a list of tuples with the sequence then short dictionary of organisms containing sequence
                self.buckets[orgs].append(insert_tup)

    def write_to_file(self):
        filename = self.output_path + "compare_"
        for org in self.organisms:
            filename += org + "_"
        filename += self.endo + ".txt"
        f = open(filename, 'w')
        for key in self.buckets:
            f.write(str(key) + " " + str(len(self.buckets[key])) + "\n")
            for seq in self.buckets[key]:
                f.write(self.ST.decompress64(seq[0], True) + "\n")
                for suborg in seq[1]:
                    f.write(str(suborg) + ":")
                    for locs in seq[1][suborg]:
                        f.write(
                            str(locs[0]) + "," +
                            str(self.ST.decompress64(locs[1])) + "\t")
                        f.write("\n")
            f.write("\n")
        f.close()