Ejemplo n.º 1
0
 def __init__(self, database: str, strand: str, mismatch: int, cas9: str):
     """
     Store the search settings and open the SQL wrapper for ``database``.

     The process working directory is switched to the directory resolved from
     ``"../main.py"`` before ``SQL`` is constructed.

     :param database: database identifier forwarded to ``SQL``
     :param strand: strand value kept on the instance
     :param mismatch: mismatch value kept on the instance
     :param cas9: cas9 value kept on the instance
     """
     main_script = os.path.abspath("../main.py")
     self.root = os.path.dirname(main_script)
     # The directory change happens first, exactly as before, so SQL is built from there.
     os.chdir(self.root)
     self.sql = SQL(database=database)
     self.strand, self.mismatch, self.cas9 = strand, mismatch, cas9
Ejemplo n.º 2
0
    def grna_display_runner(self):
        """
        Collect display options from the user, run the CRISPRi worker and show its results.

        The requested genes and masks are validated against the ``header`` column of the
        ``genes`` table before any work starts.  The worker's output is read back from
        ``temp/candidates.txt``, ``temp/backup.txt``, ``temp/dropped.txt`` and
        ``temp/offtargets.txt`` under ``self.root`` and loaded into the four result views.
        The ``temp`` directory is created only after validation succeeds and is always
        removed afterwards, even when reading the results fails.

        :return: None
        """
        display_grna = DisplayGuideRNA(database_list=self.availible_databases, cas9_list=self.availible_cas9)

        if not display_grna.exec_():
            return None

        def abort(message):
            # Every validation failure reports the problem and resets the progress bar.
            QtWidgets.QMessageBox.about(self, "Error", message)
            self.main_progressbar_value = 0
            self.main_progressbar.setValue(self.main_progressbar_value)

        # Blank out all result views while the new query is running.
        holder = PandasModel(pd.DataFrame({'': []}))
        for view in (self.display_candidates, self.display_backup,
                     self.display_dropped, self.display_offtargets):
            view.setModel(holder)
        self.database_querried = False

        self.statusBar().showMessage("Preparing ...")
        self.main_progressbar_value += 1
        self.main_progressbar.setValue(self.main_progressbar_value)

        user_options = display_grna.out()

        database = str(user_options['organism']).replace(" ", "_")
        mismatch = user_options['max_mismatch']
        max_grna = user_options['max_grna_count']
        max_primer_len = user_options['max_primer_len']
        user_cas9 = user_options['cas9']
        user_pam_tolerance = user_options['pam_tolerance']
        user_fiveprime = user_options['nucleotides_5']
        user_threeprime = user_options['nucleotides_3']

        # Names are compared in lower case with underscores stripped.
        gene_mask_dictionary = {
            'genes': [name.replace("_", "").lower() for name in user_options['genes']],
            'masks': [name.replace("_", "").lower() for name in user_options['masks']],
        }

        sqlrunner = SQL(database=database)
        headers = sqlrunner.custom_sql("SELECT header FROM genes").to_dict('list')
        known_headers = set(headers['header'])  # one set instead of a list scan per name

        # Genes are checked before masks; the first unknown name aborts the run.
        for group in ('genes', 'masks'):
            for name in gene_mask_dictionary[group]:
                if name not in known_headers:
                    db = database.replace("_", " ")
                    abort(f"{name} was not found in {db}")
                    return None

        if mismatch == "":
            abort("First search guide RNA's")
            return None

        tempdir = os.path.join(self.root, "temp")
        os.makedirs(tempdir, exist_ok=True)

        try:
            # Strand is r for reverse
            worker = CrisprInterference_worker(database=database,
                                               mismatch=mismatch,
                                               strand='r',
                                               max_grna=max_grna,
                                               genes_masks=gene_mask_dictionary,
                                               max_primer_size=max_primer_len,
                                               cas9_organism=user_cas9,
                                               pam_tolerance=user_pam_tolerance,
                                               fiveprime_nucleotides=user_fiveprime,
                                               threeprime_nucleotides=user_threeprime)

            self.threadingPool.start(worker)

            while self.threadingPool.activeThreadCount() == 1:
                self.statusBar().showMessage("Gathering guide RNA's...")
                QtWidgets.QApplication.processEvents()

                if self.main_progressbar_value < 90:
                    self.main_progressbar_value += 1
                    self.main_progressbar.setValue(self.main_progressbar_value)
                    time.sleep(0.8)
                else:
                    # Progress is capped at 90; yield briefly instead of spinning a CPU core.
                    time.sleep(0.05)

            if self.threadingPool.waitForDone():
                self.statusBar().showMessage("Gathering data ...")
                self.candidate_gRNA_df = pd.read_csv(
                    filepath_or_buffer=os.path.join(tempdir, "candidates.txt"), sep=",")
                self.backup_gRNA_df = pd.read_csv(
                    filepath_or_buffer=os.path.join(tempdir, "backup.txt"), sep=",")
                self.dropped_gRNA_df = pd.read_csv(
                    filepath_or_buffer=os.path.join(tempdir, "dropped.txt"), sep=",")
                self.offtarget_df = pd.read_csv(
                    filepath_or_buffer=os.path.join(tempdir, "offtargets.txt"), sep=",")

                cand_model, backup_model, dropped_model, offtargets_model = map(
                    PandasModel,
                    [self.candidate_gRNA_df, self.backup_gRNA_df, self.dropped_gRNA_df, self.offtarget_df])

                # Animate the remaining progress up to 100.
                while self.main_progressbar_value < 100:
                    self.main_progressbar_value += 1
                    self.statusBar().showMessage("Formatting for display...")
                    self.main_progressbar.setValue(self.main_progressbar_value)
                    time.sleep(0.01)

                self.display_candidates.setModel(cand_model)
                self.display_backup.setModel(backup_model)
                self.display_dropped.setModel(dropped_model)
                self.display_offtargets.setModel(offtargets_model)
                self.database_querried = True
                self.main_progressbar_value = 0
                self.main_progressbar.setValue(self.main_progressbar_value)
                self.statusBar().showMessage("Ready")

                # Requested genes that do not appear in the candidates table.
                missed = list(set(gene_mask_dictionary['genes']) - set(self.candidate_gRNA_df['genes']))

                EOSpopup(missed_genes=missed).exec_()
        finally:
            # Always drop the temp directory so a later run can never read stale result files.
            shutil.rmtree(tempdir)
Ejemplo n.º 3
0
    def grna_search_runner(self):
        """
        Search guide RNA's for a chosen organism and PAM and store them in its database.

        User input is validated first.  The organism's ``genome``, ``gff_file`` and ``genes``
        tables are then exported to ``temp`` under ``self.root`` together with a tab-separated
        ``config.txt``, ``FindgRNA_worker`` is run on the thread pool, and its
        ``Summary.xls`` / ``OfftargetAnalysis.txt`` output in ``temp/global_gRNA`` is loaded
        through ``Database.create_gRNA_database``.

        :return: None
        """
        search_grna = SearchGrnaDialog(database_list=self.availible_databases, cas9_list=self.availible_cas9)
        if not search_grna.exec_():
            return None

        user_data = search_grna.out()

        chosen_org = user_data['organism']
        is_circular = user_data['chromosome']
        mismatch = user_data['mismatch']
        pam = user_data['pam']
        tax_id = user_data['taxonomy_id']
        cores = user_data['cores']

        if is_circular not in {"TRUE", "FALSE"}:
            QtWidgets.QMessageBox.about(self, "Error", "chromosome needs to be TRUE or FALSE")
            return None

        chosen_org_modified = chosen_org.replace(" ", "_")

        sql_runner = SQL(database=chosen_org_modified)
        searched_pams = sql_runner.list_pams()

        if pam in searched_pams:
            QtWidgets.QMessageBox.about(self, "Error",
                                        f"PAM {pam} has already been searched, choose a different pam")
            return None

        if tax_id == "":
            QtWidgets.QMessageBox.about(self, "Error",
                                        f"Taxonomy id required")
            return None

        # The config needs exactly "<genus> <species> <strain>"; report a bad name up front
        # instead of failing on the tuple unpack after the temp files were written.
        name_parts = chosen_org.split(" ")
        if len(name_parts) != 3:
            QtWidgets.QMessageBox.about(self, "Error",
                                        f"Organism name '{chosen_org}' must be of the form 'genus species strain'")
            return None
        genus, species, strain = name_parts
        species = species.lower()

        # Resolve the BSgenome package for the strain; the last matching entry wins.
        bsgenome_package = None
        for objects in self.availible_bsgenome:
            package_name = objects.split("_")[0]
            if package_name.split(".")[-1] == strain:
                bsgenome_package = str(package_name)

        if bsgenome_package is None:
            # Previously this surfaced later as an unbound-variable crash.
            QtWidgets.QMessageBox.about(self, "Error",
                                        f"No BSgenome package found for strain {strain}")
            return None

        def advance():
            # Move the main progress bar forward by one step.
            self.main_progressbar_value += 1
            self.main_progressbar.setValue(self.main_progressbar_value)

        def write_fasta(path, frame):
            # Write the header/sequence columns of ``frame`` as FASTA records.
            with open(path, 'w') as handle:
                for header, sequence in zip(frame['header'], frame['sequence']):
                    handle.write(">" + header + '\n')
                    handle.write(sequence + '\n')

        self.statusBar().showMessage("Creating guide RNA database...")
        advance()

        tempdir = os.path.join(self.root, "temp")
        global_gRNA = os.path.join(tempdir, "global_gRNA")
        # Creates both levels and tolerates leftovers from an earlier run.
        os.makedirs(global_gRNA, exist_ok=True)

        advance()

        # One connection for all three reads, closed even if a query fails.
        connection = sqlite3.connect(os.path.join(self.root, "databases", f"{chosen_org_modified}.db"))
        try:
            tempfa = pd.read_sql("SELECT * FROM genome", connection)
            tempgff = pd.read_sql("SELECT * FROM gff_file", connection)
            tempgenes = pd.read_sql("SELECT * FROM genes", connection)
        finally:
            connection.close()

        tempgff.to_csv(f"{os.path.join(tempdir, chosen_org_modified)}.gff", header=False, index=False, sep="\t")
        advance()

        write_fasta(os.path.join(tempdir, f"{chosen_org_modified}.fasta"), tempfa)
        advance()

        write_fasta(os.path.join(tempdir, f"{chosen_org_modified}_genes.fasta"), tempgenes)
        advance()

        config_file = {
            'organism': [f"{genus} {species} {strain}"],
            'taxonomy_id': [tax_id],
            'circular_chromosome': [is_circular],
            'input_file': [os.path.join(tempdir, f"{chosen_org_modified}_genes.fasta")],
            'gff_file': [os.path.join(tempdir, f"{chosen_org_modified}.gff")],
            'find_gRNA_with_cutsites': ["FALSE"],
            'find_paired_gRNA': ["FALSE"],
            'BSgenome': [bsgenome_package],
            'chromosomes_to_search': ["all"],
            'min_gap': [0],
            'max_gap': [20],
            'gRNA_size': [20],
            'max_mismatch_gRNA': [mismatch],
            'PAM_sequence': [pam],
            'PAM_length': [len(pam)],
            'n.cores': [cores],
            'scoring_method': ["CFDscore"]
        }

        config_file = pd.DataFrame(config_file)
        config_file.to_csv(os.path.join(tempdir, "config.txt"), index=False, sep="\t")

        findgRNA_worker = FindgRNA_worker()

        self.threadingPool.start(findgRNA_worker)

        while self.threadingPool.activeThreadCount() == 1:
            self.statusBar().showMessage("predicting all guide RNA's...")
            QtWidgets.QApplication.processEvents()
            if self.main_progressbar_value <= 90:
                advance()
                time.sleep(2)
            else:
                # Progress is capped; yield briefly instead of spinning a CPU core.
                time.sleep(0.05)

        if self.threadingPool.waitForDone():
            self.statusBar().showMessage("buidling global gRNA database...")
            database = Database(database=chosen_org_modified)
            database.create_gRNA_database(summary=os.path.join(global_gRNA, "Summary.xls"),
                                          offtarget=os.path.join(global_gRNA, "OfftargetAnalysis.txt"),
                                          config_file=os.path.join(tempdir, "config.txt"))

            # Fill the bar to 100 without overshooting.
            while self.main_progressbar_value < 100:
                advance()

            shutil.rmtree(tempdir)
            self.main_progressbar_value = 0
            self.main_progressbar.setValue(self.main_progressbar_value)
            self.statusBar().showMessage("Ready")
Ejemplo n.º 4
0
 def run(self):
     """
     Run ``self.sql_query`` against ``self.database`` and dump the result to disk.

     The result is written as comma-separated text, with a header row and without the
     index, to ``temp/query.txt`` under ``self.root``.
     """
     destination = os.path.join(self.root, "temp", "query.txt")
     result = SQL(database=self.database).custom_sql(statement=self.sql_query)
     result.to_csv(destination, header=True, index=False, sep=",")
Ejemplo n.º 5
0
    def run(self):
        """
        Build the candidate, backup and dropped guide RNA tables and write them to disk.

        Steps, in order: load the global gRNA table for ``self.mismatch``, optionally
        restrict it to the requested genes, run ``RefineCripri.cripr_interference``,
        annotate the three result tables, resolve off-targets for candidates and backups,
        rebalance and post-process the tables, and finally write ``candidates.txt``,
        ``backup.txt``, ``dropped.txt`` and ``offtargets.txt`` (comma separated, with header)
        into ``temp`` under ``self.root``.
        """
        sqlrunner = SQL(database=os.path.join(self.root, "databases", self.database))
        gRNA_db = sqlrunner.get_global_gRNA(mismatch=str(self.mismatch))

        # This is a rate limiting step
        if bool(self.gene_mask_dict['genes']):
            query_data = self.get_targeted_data(dataframe=gRNA_db, gene_mask_dict=self.gene_mask_dict)
        else:
            query_data = gRNA_db

        multifasta = sqlrunner.get_gene_multifasta()

        gRNA_runner = RefineCripri(grna_dataframe=query_data,
                                   strand=self.strand,
                                   fasta_dataframe=multifasta,
                                   cas9=self.cas9_organism,
                                   offtarget_ids=sqlrunner.custom_sql("SELECT name, strand FROM global_offtarget"))

        candidates, backup, dropped = gRNA_runner.cripr_interference()

        candidates, backup, dropped = map(self.utils.annotate_dataframe,
                                          [candidates, backup, dropped])

        offtargets = sqlrunner.get_offtargets_by_mismatch(mismatch=self.mismatch)
        offtargets.dropna(subset=['annotation'], inplace=True)
        # Copy the filtered rows so the column assignment below acts on an independent frame.
        offtargets = offtargets[offtargets['strand'] != '+'].copy()
        offtargets['annotation'] = offtargets['annotation'].apply(
            lambda x: x.replace("_", "") if isinstance(x, str) else x)

        # Keep only rows whose annotation differs from the gene column.
        offtargets = offtargets.query("gene != annotation").reset_index(drop=True)

        offtarget_ids = list(set(offtargets['name']))

        # Candidates are resolved first because both passes feed the shared dropped table.
        candidates, dropped, candidates_offtargets = self._filter_by_offtargets(
            grna=candidates, dropped=dropped, offtargets=offtargets, offtarget_ids=offtarget_ids)
        backup, dropped, backup_offtargets = self._filter_by_offtargets(
            grna=backup, dropped=dropped, offtargets=offtargets, offtarget_ids=offtarget_ids)

        # Rank by mismatches and rebalance guides between the candidate and backup tables.
        candidates, backup = self.scan_maxmismatches(candidates=candidates, backup=backup)
        candidates, backup = self.force_max_grna_in_candidates(candidates=candidates, backup=backup,
                                                               max_grna=self.max_grna)

        candidates = self.force_ag_base(dataframe=candidates, max_primer_size=self.max_primer_size)

        backup = self.force_ag_base(dataframe=backup, max_primer_size=self.max_primer_size)

        candidates, backup, dropped = map(self.calculate_primer_len, [candidates, backup, dropped])

        candidates, backup, dropped = map(self.calculate_gc_content, [candidates, backup, dropped])

        candidates = self.design_primers(dataframe=candidates, cas9=self.cas9_organism,
                                         fiveprime=self.fiveprime, threeprime=self.threeprime)

        backup = self.design_primers(dataframe=backup, cas9=self.cas9_organism,
                                     fiveprime=self.fiveprime, threeprime=self.threeprime)

        candidates, backup, dropped = map(pd.DataFrame,
                                          [candidates, backup, dropped])

        if candidates_offtargets.empty and backup_offtargets.empty:
            # Nothing to report: emit just the off-target header row.
            final_offtargets = pd.DataFrame(columns=offtargets.columns)
        else:
            # Tag each row with the table it belongs to, then stack both tables.
            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
            candidates_offtargets['from'] = "candidates"
            backup_offtargets['from'] = "backup"
            final_offtargets = pd.concat([candidates_offtargets, backup_offtargets], ignore_index=True)

        output_dir = os.path.join(self.root, "temp")
        candidates.to_csv(os.path.join(output_dir, "candidates.txt"), header=True, index=False, sep=",")
        backup.to_csv(os.path.join(output_dir, "backup.txt"), header=True, index=False, sep=",")
        dropped.to_csv(os.path.join(output_dir, "dropped.txt"), header=True, index=False, sep=",")
        final_offtargets.to_csv(os.path.join(output_dir, "offtargets.txt"), header=True, index=False, sep=",")

    def _filter_by_offtargets(self, grna, dropped, offtargets, offtarget_ids):
        """
        Apply the off-target rules to one guide RNA table.

        :param grna: guide RNA table with a ``names`` column
        :param dropped: table receiving the guides removed because of off-targets
        :param offtargets: filtered off-target table
        :param offtarget_ids: names of the guides that have off-targets
        :return: ``(grna, dropped, offtargets_of_grna)``; the last item is a DataFrame that is
                 empty (off-target columns only) when no guide in ``grna`` has an off-target
        """
        if not self.list_comparison(list1=grna['names'], list2=offtarget_ids):
            return grna, dropped, pd.DataFrame(dict.fromkeys(offtargets, []))

        shared_ids = list(set(grna['names']) & set(offtarget_ids))

        grna_offtargets = self.grab_offtargets(query=grna,
                                               offtargets=offtargets,
                                               offtarget_ids=offtarget_ids)

        grna = self.negate_pam_mismatch(grna_dataframe=grna,
                                        offtarget_dataframe=grna_offtargets,
                                        target_ids=shared_ids)

        grna, dropped = self.move_grna_by_offtargets(grna_dataframe=grna,
                                                     dropped_dataframe=dropped,
                                                     offtarget_dataframe=grna_offtargets,
                                                     masks=self.gene_mask_dict['masks'])

        return grna, dropped, pd.DataFrame(grna_offtargets)
Ejemplo n.º 6
0
    def force_ag_base(self, dataframe, max_primer_size):
        """
        Extend guide RNA's that do not start with A or G until they end on an A/G base.

        For every row whose gRNA does not start with ``a``/``g``, the reversed gRNA is located
        in the complemented gene sequence and the stretch is extended to the first ``a``/``g``
        base found after the PAM plus the fixed 20-base primer.  On success the row's ``gRNA``
        is replaced (upper case), ``score`` is lowered by 0.5 and a ``notes`` entry containing
        "position 20 is" is reset to "PASS".  Rows are modified in place.

        Rows are left untouched when the gRNA is empty, when it cannot be located in the gene,
        or when no A/G base exists inside the searched window.

        :param dataframe: table with ``genes``, ``gRNA``, ``PAM``, ``score`` and ``notes`` columns
        :param max_primer_size: number of extra bases that may be scanned past the primer
        :return: the same ``dataframe`` object
        """
        complement = {'a': 't', 't': 'a', 'c': 'g', 'g': 'c'}
        primer_core_len = 20  # 20 is the primer length, fixed value

        sqlrunner = SQL(database=os.path.join(self.root, "databases", self.database))
        for idx, genes in enumerate(dataframe['genes']):
            grna = str(dataframe['gRNA'][idx]).lower()
            if not grna or grna[0] in ("a", "g"):
                # Already satisfies the constraint (or nothing to work with).
                continue

            # Only rows that fail the constraint need the gene sequence.
            gene_sequence = str(sqlrunner.get_gene_sequence(gene=genes)).lower()
            gene_len = len(gene_sequence)
            # Complement base by base; anything that is not a/c/g/t becomes "N".
            sequence_swapped = "".join(complement.get(base, "N") for base in gene_sequence)

            loc_in_gene = sequence_swapped.find(grna[::-1])
            if loc_in_gene < 0:
                # gRNA not present in this gene: slicing from -1 would write garbage.
                continue

            pam_len = len(str(dataframe['PAM'][idx]))
            primer_wo_pam_stop = loc_in_gene + pam_len + primer_core_len

            if primer_wo_pam_stop + max_primer_size < gene_len:
                # Whole window fits inside the gene: scan it, starting at the stop position.
                scan = range(primer_wo_pam_stop, primer_wo_pam_stop + max_primer_size + 1)
            elif primer_wo_pam_stop == gene_len:
                # Primer ends exactly at the gene end: take everything up to the end.
                scan = None
            else:
                # Window runs past the gene end: scan only positions that exist
                # (the scan starts one base after the stop position, as before).
                scan = range(primer_wo_pam_stop + 1, gene_len)

            if scan is None:
                end = gene_len
            else:
                end = next((pos + 1 for pos in scan if sequence_swapped[pos] in ("a", "g")), None)
                if end is None:
                    continue

            dataframe['gRNA'][idx] = sequence_swapped[loc_in_gene:end][::-1].upper()
            dataframe['score'][idx] = dataframe['score'][idx] - 0.5

            if "position 20 is" in dataframe['notes'][idx]:
                dataframe['notes'][idx] = "PASS"

        return dataframe
Ejemplo n.º 7
0
class CrisprFuncHelpers:
    """
    Helper fixtures for the unit tests of crispr.py.

    Each ``*_test`` method computes a value from the database and ``RefineCripri``; the
    matching ``*_result`` method returns the value the test is expected to produce.
    """

    def __init__(self, database: str, strand: str, mismatch: int, cas9: str):
        """
        Store the test settings and open the SQL wrapper for ``database``.

        The working directory is switched to the directory resolved from ``"../main.py"``
        before ``SQL`` is constructed.
        """
        self.root = os.path.dirname(os.path.abspath("../main.py"))
        os.chdir(self.root)
        self.sql = SQL(database=database)
        self.strand = strand
        self.mismatch = mismatch
        self.cas9 = cas9

    @staticmethod
    def _select_rows_for_genes(data, query):
        """
        Return the rows of ``data`` that belong to the genes listed in ``query``.

        A gene is used only when it occurs verbatim in ``data['genes']``; its rows are then
        all rows whose ``genes`` value contains that name.  Rows are stacked in ``query``
        order with a fresh index.  An empty DataFrame is returned when nothing matches.
        """
        known_genes = set(data['genes'])
        frames = [
            data.loc[data['genes'].str.contains(gene, regex=False, na=False)]
            for gene in query
            if gene in known_genes
        ]
        if not frames:
            return pd.DataFrame()
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        return pd.concat(frames, ignore_index=True)

    def _load_grna_for_genes(self, query):
        """Load the global gRNA table, add a ``genes`` column and keep the ``query`` genes."""
        data = self.sql.get_global_gRNA(mismatch=self.mismatch)
        # The gene is the part of the guide name before the first underscore.
        data['genes'] = [name.split("_")[0] for name in data['names']]
        return self._select_rows_for_genes(data, query)

    def initial_filter_test(self):
        """
        Run ``RefineCripri.initial_filter`` on two genes and check the three result tables.

        :return: three booleans: every candidate scores below 2, every backup scores 2 or
                 more, every dropped guide name ends in something other than ``self.strand``.
                 An empty table counts as passing.
        """
        out = self._load_grna_for_genes(["Rv0899", "Rv0934"])

        runner = RefineCripri(grna_dataframe=out,
                              strand=self.strand,
                              fasta_dataframe=None,
                              cas9=self.cas9)

        candidates, backup, dropped = map(pd.DataFrame, runner.initial_filter())

        # all() replaces list(set(...))[0], which raised IndexError on an empty table.
        candidates_ok = all(row['score'] < 2 for _, row in candidates.iterrows())
        backup_ok = all(row['score'] >= 2 for _, row in backup.iterrows())
        dropped_ok = all(row['names'][-1] != self.strand for _, row in dropped.iterrows())

        return [candidates_ok, backup_ok, dropped_ok]

    def initial_filter_result(self):
        """Expected value of :meth:`initial_filter_test`."""
        return [True, True, True]

    def has_offtarget_test(self):
        """
        Run ``initial_filter`` followed by ``has_offtarget`` on three genes.

        :return: True once both steps have completed and the results converted to DataFrames
        """
        out = self._load_grna_for_genes(["Rv0899", "Rv0934", "Rv0051"])

        runner = RefineCripri(grna_dataframe=out,
                              strand=self.strand,
                              fasta_dataframe=None,
                              cas9=self.cas9)

        candidates, backup, dropped = runner.initial_filter()
        candidates, backup, dropped = runner.has_offtarget(
            candidates=candidates, backup=backup, dropped_gRNA=dropped)
        candidates, backup, dropped = map(pd.DataFrame,
                                          [candidates, backup, dropped])
        return True

    def has_offtarget_result(self):
        """Expected value of :meth:`has_offtarget_test`."""
        return True