class LimmaVoom(object):
    """Run a limma-voom differential-expression analysis through rpy2.

    The R session is driven with ``robjects.r``; R itself writes the result
    table to the output path. ``run_de`` classifies one already-split row of
    that table.
    """

    def __init__(self, count, group, repl, out):
        """
        Store the analysis inputs and the thresholds used by run_de.
        :param count: path of the CSV count table (read in R with row.names = 1)
        :param group: iterable with the name (str) of each group
        :param repl: number of replicates per group (int)
        :param out: path of the tab-separated result file written by R
        """
        self._table_count = count
        self._groups_name = group
        self._replic = repl
        self._output = out
        self._message = Message()
        # Column positions inside a split row of the result file.
        self._logfc_column = 2
        self._pvalue_column = 5
        # Thresholds: |logFC| must reach _logfc, p-value must not exceed _pvalue.
        self._logfc = 2
        self._pvalue = 0.05

    def run_de(self, gene):
        """
        Decide whether one result row is differentially expressed.
        :param gene: sequence of column values (strings) of one result row
        :return: 1 when |logFC| >= threshold and p-value <= threshold, else 0
        """
        lfc = float(gene[self._logfc_column])
        pv = float(gene[self._pvalue_column])
        # A gene is significant when its p-value is *small*; the previous
        # ">=" test selected exactly the non-significant genes.
        if abs(lfc) >= self._logfc and pv <= self._pvalue:
            return 1
        return 0

    def run_limmavoom(self):
        """
        Execute default analysis with Limma-voom
        Nothing is executed (only messages are emitted) when the number of
        replicates is exactly 1.
        :return: None
        :raises RRuntimeError: re-raised after being reported when R fails
        """
        if self._replic == 1:
            self._message.message_4(
                "limma-voom require more than one replics.")
            self._message.message_9("--- limma-voom: is kipped!")
            return
        try:
            robjects.r('library("edgeR")')
            robjects.r('library("limma")')
            robjects.r('table <- read.csv("' + self._table_count
                       + '", row.names = 1, header = TRUE, stringsAsFactors=FALSE)')
            robjects.r('m <- as.matrix(table)')
            robjects.r('nf = calcNormFactors(m, method = "TMM")')
            # One quoted label per sample: every group name repeated
            # `_replic` times, in group order.
            labels = ",".join('"' + name + '"'
                              for name in self._groups_name
                              for _ in range(self._replic))
            robjects.r('condition = factor(c(' + labels + '))')
            # The argument is `lib.size`; it was misspelled `lib.ize`, so the
            # TMM-scaled library sizes never reached voom under that name.
            robjects.r(
                'voom.data <- voom(m, model.matrix(~factor(condition)), lib.size = colSums(m) * nf)'
            )
            robjects.r('voom.data$genes = rownames(m)')
            robjects.r(
                'voom.fitlimma = lmFit(voom.data, design=model.matrix(~factor(condition)))'
            )
            robjects.r('voom.fitbayes = eBayes(voom.fitlimma)')
            robjects.r('voom.pvalues = voom.fitbayes$p.value[, 2]')
            robjects.r(
                'voom.adjpvalues = p.adjust(voom.pvalues, method="BH")')
            # NOTE(review): `design` is a plain vector hard-coded for two
            # groups, so ncol(design) is NULL in R - confirm the coefficient
            # topTable is meant to report.
            design = ",".join(["1"] * self._replic + ["2"] * self._replic)
            robjects.r('design <- c(' + design + ')')
            robjects.r(
                'data <- topTable(voom.fitbayes, coef=ncol(design), number=1000000)'
            )
            robjects.r('write.table(data, file="' + self._output
                       + '", sep = "\t", quote = FALSE)')
            self._message.message_9("--- limma-voom: is completed!")
        except RRuntimeError as rre:
            self._message.message_9("Error in limma-voom execution: " + str(rre))
            raise
class EdgeR(object):
    """Run an edgeR exact-test analysis through rpy2.

    R writes the ``topTags`` table to the output path; ``run_de`` classifies
    one already-split row of that table.
    """

    def __init__(self, count, group, repl, out):
        """
        Define the edgeR object
        :param count: path of the comma-separated count table
        :param group: iterable with the name (str) of each group
        :param repl: number of replicates per group (int)
        :param out: path of the tab-separated result file written by R
        """
        self._table_count = count
        self._groups_name = group
        self._replic = repl
        self._output = out
        self._column_result = [3, 4]
        self._min_result = []
        self._message = Message()
        # Column positions inside a split row of the result file.
        self._logfc_colum = 1
        self._pvalue_colum = 3
        # Thresholds: |logFC| must reach _logfc, p-value must not exceed _pvalue.
        self._pvalue = 0.05
        self._logfc = 2

    def run_de(self, gene):
        """
        Decide whether one result row is differentially expressed.
        :param gene: sequence of column values (strings) of one result row
        :return: 1 when |logFC| >= threshold and p-value <= threshold, else 0
        """
        lfc = float(gene[self._logfc_colum])
        pv = float(gene[self._pvalue_colum])
        # Significance means a *small* p-value; the previous ">=" test
        # selected exactly the non-significant genes.
        if abs(lfc) >= self._logfc and pv <= self._pvalue:
            return 1
        return 0

    def run_edger(self):
        """
        Execute default analysis with edgeR
        With exactly one replicate a fixed dispersion (bcv = 0.2) is used;
        with more, the dispersion is estimated from the data. With fewer than
        one replicate only a warning is emitted.
        :return: None
        :raises RRuntimeError: re-raised after being reported when R fails
        """
        try:
            finish_message = ""
            robjects.r('library("limma")')
            robjects.r('library("edgeR")')
            robjects.r('table <- read.csv("' + self._table_count
                       + '", row.names = 1, header = TRUE, stringsAsFactors=FALSE, sep = "'
                       + "," + '")')
            robjects.r('m <- as.matrix(table)')
            assert isinstance(self._replic, int)
            # One quoted label per sample: every group name repeated
            # `_replic` times, in group order.
            labels = ", ".join("'" + name + "'"
                               for name in self._groups_name
                               for _ in range(self._replic))
            robjects.r('group <- c(' + labels + ')')
            robjects.r('y.dge <- DGEList(counts = m, group = group)')
            write_cmd = ('write.table(y.tp$table, "' + self._output
                         + '", sep = "\t", quote = FALSE)')
            if self._replic < 1:
                self._message.message_4(" Replicates not found by edgeR. EdgeR should be executed manual form.")
            elif self._replic == 1:
                # edgeR manual based solution for without replicates
                robjects.r('bcv <- 0.2')
                robjects.r('y.et <- exactTest(y.dge, dispersion = bcv^2)')
                robjects.r('y.tp <- topTags(y.et, n = 100000)')
                robjects.r('y.pvalues <- y.et$table$PValue')
                robjects.r(write_cmd)
                finish_message = "--- edgeR without replicates is completed!"
            else:
                # This branch used a bare `r(...)`; `robjects.r` is the
                # evaluator used everywhere else in this method.
                robjects.r('y.dge <- calcNormFactors(y.dge)')
                robjects.r('y.dge <- estimateDisp(y.dge)')
                robjects.r('y.dge <- estimateCommonDisp(y.dge)')
                robjects.r('y.et <- exactTest(y.dge)')
                robjects.r('y.tp <- topTags(y.et, n = 100000)')
                robjects.r('y.pvalues <- y.et$table$PValue')
                robjects.r(write_cmd)
                finish_message = "--- edgeR with replicates is completed!"
            self._message.message_9(finish_message)
        except RRuntimeError as rre:
            self._message.message_9("Error in edgeR execution: " + str(rre))
            raise
class Experiment(object):
    """
    Business object of Experiment

    Validates the configuration, runs mapping + counting, runs every
    expression tool and finally builds a consensus list of genes.
    """
    # String annotation: CountBo is defined further down in this file, so a
    # bare name here would be evaluated before it exists.
    _count: "CountBo"

    def __init__(self):
        self._exp_dao = None
        self._reference = None
        self._transcript = False
        self._count = None
        self._expression = None
        self._mapp_bo = None
        self.message = Message()
        self._fastq = []
        self._out_mapp = []
        self._count_table = []
        self._merged_table_out = None
        self._edger = None
        self._bayseq = None
        self._deseq = None
        self._noiseq = None
        self._ebseq = None
        self._samseq = None
        self._limmavoom = None

    def init_experiment(self, exp, file):
        """
        Initialize experiment: load the configuration and validate it.
        The process exits when the reference file has an unexpected
        extension or when the data is paired-end.
        :param exp: ExperimentDao that will hold the configuration
        :param file: config file
        :return: None
        """
        assert isinstance(exp, ExperimentDao)
        self._exp_dao = exp
        self._exp_dao.read_configuration_file(file)
        self._exp_dao._name = self.name_valid(self._exp_dao._name)
        self.rep_valid(self._exp_dao._replic)
        self.group_number_valid(self._exp_dao._group_number)
        ref = self._exp_dao._reference
        if ref == "":
            # Without a reference no mapping is possible: ask for a ready
            # count table instead.
            print(
                "You don't have a refserence genome... Expression analyse need a table count with mapping reads"
            )
            self._merged_table_out = input("Type absolute path to table count")
        elif self.extension_valid(ref, "fa") or self.extension_valid(ref, "fasta"):
            self._reference = ref
        else:
            self.message.message_3("REFERENCE FILE ")
            exit(0)
        self.directory_valid(self._exp_dao._read_directory, "reads")
        for i in self._exp_dao._group_directory:
            reads = self._exp_dao._read_directory + "/" + str(i)
            self.directory_valid(reads, "group")
            # Get fastq reads
            path_find = self._exp_dao._read_directory + "/" + i + "/"
            self._fastq.append(self.get_reads_file(path_find))
        if self._exp_dao._paired_end == True:
            self.message.message_8(
                "The sequence is paired-end. CONSEXPRESSION dont make paired-end analysis"
            )
            exit(0)
        else:
            self.message.message_8("The sequence is single-end")

    def name_valid(self, name):
        """
        Verify the name: if is empty change to default name
        :param name: name of experiment
        :return: the given name, or "consexpression" when it is empty
        """
        if len(name) == 0:
            name = "consexpression"
            self.message.message_7(
                "Experiment name is empty! The name was changed to consexpression"
            )
        return name

    def rep_valid(self, rep):
        """
        Verify if the number of replicates is valid (>= 1); exits otherwise.
        :param rep: number of replicates
        :return: None
        """
        if rep >= 1:
            self.message.message_1("replics")
        else:
            self.message.message_2(
                "1 replic or more (technique or biological)")
            self.message.message_3("number of replics in line 5 - 6")
            exit()

    def extension_valid(self, path, extension):
        """
        Verify if the extension file is the expected
        :param path: file path (str)
        :param extension: expected suffix, e.g. "fa"
        :return: boolean, True when path ends with extension
        """
        return str.endswith(path, extension)

    def directory_valid(self, path, type):
        """
        Verify if path is a directory; exits when it is not.
        :param path: path of file
        :param type: label used in the messages ("reads", "group", ...)
        :return: None
        """
        if os.path.isdir(path):
            self.message.message_1("directory " + type + ": " + path)
        else:
            self.message.message_2("a valid directory " + type + " path")
            self.message.message_3(" the directory " + type + " path (line 9)")
            exit()

    def group_number_valid(self, group_n):
        """
        Verify the number of groups. The minimal is one; exits otherwise.
        :param group_n: number of groups (int)
        :return: None
        """
        assert isinstance(group_n, int)
        if group_n >= 1:
            self.message.message_1("group number")
        else:
            self.message.message_2("1 group or more.")
            self.message.message_3("the number of gruoups in line 7")
            exit()

    def file_valid(self, path):
        """
        Verify if path is a file; exits when it is not.
        :param path: path to check
        :return: None
        """
        if os.path.isfile(path):
            self.message.message_1(" file: " + path)
        else:
            self.message.message_2(" a valid reference file")
            self.message.message_3(" the reference file path (line 7)")
            exit()

    def exceute_mapp_count(self):
        """
        Execute Tophat and htseq-count for every fastq file of every group.
        The count table path of each successfully mapped sample is appended
        to self._count_table.
        :return: None
        """
        ref = self._reference
        thread = self._exp_dao._threads
        sing = self._exp_dao._paired_end
        n = self._exp_dao._name
        path_find = []
        for grp in self._fastq:
            for grp_file in grp:
                bar = 1 + grp_file.rfind('/')
                # Mapping output goes to <reads dir>/<experiment name>/.
                out_dir = grp_file[:bar] + n
                out_mapp = out_dir + "/" + grp_file[bar:]
                if not os.path.isdir(out_dir):
                    os.mkdir(out_dir, 0o755)
                out_mapp = out_mapp.replace('fastq', 'sam')
                path_find.append(out_mapp)
                mapp_vo = MappVo("TopHat", ref, grp_file, "", thread, out_mapp, "", sing)
                self._mapp_bo = MappBo(mapp_vo)
                if self._mapp_bo.execute_mapp() != 0:
                    self.message.message_4(
                        "Task: Mapping don't run correctly.")
                    continue
                dot = out_mapp.rfind('.')
                in_type = out_mapp[dot + 1:]
                bar = 1 + out_mapp.rfind('/')
                table_count = out_mapp[:bar] + out_mapp[bar:dot] + "_table_count.txt"
                self._count_table.append(table_count)
                in_count = out_mapp + "/accepted_hits.sam"
                count_vo = CountVo(in_count, self._exp_dao._annotation_file,
                                   self._exp_dao._annotation_type, in_type,
                                   self._exp_dao._count_mode, table_count)
                self._count = CountBo(count_vo)
                if self._count.execute_count() == 0:
                    self.message.message_8("Count Sucsessfull!!!")
                else:
                    self.message.message_4(
                        "Error in counting mapped reads...")
            # NOTE(review): path_find is shared by all groups, so every entry
            # appended here is the same growing list - confirm intent.
            self._out_mapp.append(path_find)

    def get_reads_file(self, dir):
        """
        Get all fastq path of dir
        :param dir: path to folder of fastq sample (must end with "/")
        :return: array of reads file path
        """
        fastq_file = glob.glob(dir + "*.fastq")
        if len(fastq_file) == 0:
            self.message.message_7("*Not found files FASTQ in directorie " + dir)
        return fastq_file

    def execute_expression_analysis(self):
        """
        Make analysis with counts data for mapping: run edgeR, baySeq, DESeq,
        NOISeq, EBSeq and limma-voom on the merged count table.
        :return: None
        """
        print("Expression analisys start...")
        if self._exp_dao._reference != "":
            out_merge_table = self._exp_dao._read_directory + "/" + self._exp_dao._name + "_table_count.txt"
            self.execute_merge_table(self._count_table, out_merge_table)
        else:
            out_merge_table = self._merged_table_out
        groups = self._exp_dao._group_name
        replic = self._exp_dao._replic
        out_expression = self._exp_dao._output + "/" + self._exp_dao._name
        # 1 ------------------ edgeR -----------------
        print("---- edgeR START! ------------")
        self._edger = EdgeR(out_merge_table, groups, replic, out_expression + "_edger.csv")
        self._edger.run_edger()
        # 2 ------------- BaySeq --------------------
        print("---- baySeq START! ------------")
        self._bayseq = BaySeq(out_merge_table, groups, replic, out_expression + "_baySeq.csv")
        self._bayseq.run_bayseq()
        # 3 ------------- DESeq --------------------
        print("---- DESeq START! ------------")
        self._deseq = DESeq(out_merge_table, groups, replic, out_expression + "_DESeq.csv")
        self._deseq.run_deseq()
        # 4 ------------- NOISeq --------------------
        print("---- NOISeq START! ------------")
        self._noiseq = Noiseq(out_merge_table, groups, replic, out_expression + "_NOISeq.csv")
        self._noiseq.run_noiseq()
        # 5 ------------- EBSeq --------------------
        print("---- EBSeq START! ------------")
        self._ebseq = Ebseq(out_merge_table, groups, replic, out_expression + "_EBSeq.csv")
        self._ebseq.run_ebseq()
        # 6 ------------- SAMSeq --------------------
        # SAMSeq is currently disabled; self._samseq stays None.
        print("---- SAMSeq START! ------------")
        # out_samseq = out_expression + "_SAMSeq.csv"
        # self._samseq = SamSeq(out_merge_table, self._exp_dao._group_name, self._exp_dao._replic, out_samseq)
        # self._samseq.run_samseq()
        # 7 ------------- limma-voom --------------------
        print("---- limma START! ------------")
        self._limmavoom = LimmaVoom(out_merge_table, groups, replic, out_expression + "_limmavoom.csv")
        self._limmavoom.run_limmavoom()

    def _add_votes(self, gene_de, tool, sep="\t", key_col=0):
        """
        Add the votes of one tool to the consensus tally.
        :param gene_de: dict gene name -> number of tools calling it DE
        :param tool: object exposing `_output` (result path) and `run_de(row)`
        :param sep: column separator of the result file
        :param key_col: index of the column holding the gene name
        :return: None (gene_de is updated in place)
        """
        with open(tool._output, 'r') as handle:
            next(handle, None)  # first line is the header
            for line in handle:
                gene = line.split(sep)
                vote = int(tool.run_de(gene))
                key = gene[key_col]
                gene_de[key] = gene_de.get(key, 0) + vote

    def execute_conseus(self, out):
        """
        Write the consensus file: one "gene,votes" line for every gene that
        at least 4 tools call differentially expressed.
        :param out: path of the consensus file to write
        :return: None
        """
        gene_de = {}
        self._add_votes(gene_de, self._bayseq)
        self._add_votes(gene_de, self._edger)
        self._add_votes(gene_de, self._deseq, key_col=1)
        self._add_votes(gene_de, self._noiseq, sep=",")
        if self._samseq is None:
            print("SAMSeq results not found")
        else:
            self._add_votes(gene_de, self._samseq, key_col=1)
        if self._exp_dao._replic >= 2:
            self._add_votes(gene_de, self._limmavoom)
        else:
            print("limma require more than one replics")
        self._add_votes(gene_de, self._ebseq)
        # --- write results (the file used to be left open)
        with open(out, 'w') as out_cons:
            out_cons.write('gene, indications')
            print(len(gene_de))
            for name, votes in gene_de.items():
                if votes >= 4:
                    out_cons.write("\n" + name + "," + str(votes))

    def execute_merge_table(self, out_mapp_list, out_name):
        """
        Make a merge table with counts
        :param out_mapp_list: list of per-sample count files
        :param out_name: path of the merged table to write
        :return: None
        """
        self._count.merge_table_count(out_mapp_list, out_name, self._exp_dao._group_name)
class CountBo(object):
    """
    This object define business rules to make count table execution
    """

    def __init__(self, count):
        """
        Keep the value object describing the count command.
        :param count: CountVo with the command configuration
        """
        assert isinstance(count, CountVo)
        self._counter = count
        self.message = Message()

    def annotation_format(self):
        """
        Verify format of annotation file (default: GTF | GFF); a message is
        emitted when the extension is anything else.
        :return: void
        """
        annotation = self._counter.annotation_file
        # Skip the dot itself: the old slice kept it ('.gtf'), so the
        # comparison with 'gtf'/'gff' could never succeed.
        extension = annotation[annotation.rfind('.') + 1:].lower()
        if extension not in ('gtf', 'gff'):
            self.message.message_4('File extension of annotation file can be only GTF or GFF.')

    def execute_count(self):
        """
        Execute command htseq-count
        :return: int exit status of the command
        """
        # The command line comes from CountVo.to_string() and is run through
        # the shell as-is.
        text = self._counter.to_string()
        return subprocess.call(text, shell=True)

    def merge_table_count(self, list_file, out, groups_name):
        """
        Make a table with count of all samples. Files are assigned to groups
        in order: the first len(list_file) / len(groups_name) files belong to
        the first group, and so on.
        :param list_file: array count files (tab separated: gene, count)
        :param out: text file line (gene) column (sample) data (count mapped)
        :param groups_name: treatment of samples
        :return: None
        """
        # Summary rows of the count files that are not genes.
        no_genes = {'__no_feature', '__ambiguous', '__too_low_aQual',
                    '__not_aligned', '__alignment_not_unique', 'not_aligned',
                    'no_feature', 'ambiguous', 'too_low_aQual',
                    'alignment_not_unique'}
        rep = int(len(list_file) / len(groups_name))
        group_count = 0
        rep_count = 1
        gene = {}
        with open(out, 'w') as out_file:
            out_file.write("gene")
            # loop table count by samples
            for ind in list_file:
                with open(ind, 'r') as op:
                    if rep_count > rep:
                        # Current group is complete: move to the next one.
                        rep_count = 1
                        group_count += 1
                    out_file.write("," + groups_name[group_count] + str(rep_count))
                    for line in op:
                        text = line.rstrip().split("\t")
                        # Blank or malformed lines carry no count.
                        if len(text) < 2 or text[0] in no_genes:
                            continue
                        if text[0] in gene:
                            gene[text[0]] = gene[text[0]] + ',' + text[1]
                        else:
                            gene[text[0]] = text[1]
                rep_count += 1
            for name, counts in gene.items():
                out_file.write("\n" + name + "," + str(counts))
class MappBo(object):
    """
    This class make rules of validate information and command, to execute Mapp tools
    """

    def __init__(self, mapp):
        """
        Keep the value object describing the mapping command.
        :param mapp: MappVo with the command configuration
        """
        assert isinstance(mapp, MappVo)
        self._map_vo = mapp
        self._reads_file = []
        self.message = Message()

    def threads_conf(self, threads_vo):
        """
        Reduce the configured number of threads when it is larger than the
        number of CPUs of the system.
        :param threads_vo: number of threads requested
        :return: void
        """
        threads_sys = multiprocessing.cpu_count()
        # The comparison used to be "<", which clamped valid requests and let
        # oversized ones through - the opposite of what the message reports.
        if threads_vo > threads_sys:
            # Leave one CPU free, but never go below one thread.
            allowed = max(1, threads_sys - 1)
            self._map_vo._threads_value = allowed
            self.message.message_9("The threads nunber defined is " + str(threads_vo) + ", but the system have only " + str(threads_sys))
            self.message.message_9("---> Number of threads was change to " + str(allowed))
        self.message.message_9("Successful! Threads configuration is ok!")

    def execute_mapp(self):
        """
        Execute the command: 0 is ok, non-zero is fail mapped task
        :return: int exit status (1 when the index could not be built)
        """
        self.threads_conf(self._map_vo._threads_value)
        index_name = self.make_bowtie2_index(self._map_vo._index_name)
        if not index_name:
            # No index, nothing to map against: report failure instead of
            # launching the mapper with an empty index name.
            return 1
        self._map_vo._index_name = index_name
        # The command line comes from MappVo.to_string() and is run through
        # the shell as-is.
        text = self._map_vo.to_string()
        return subprocess.call(text, shell=True)

    def make_bowtie2_index(self, index):
        """
        Execute command to make a bowtie2 index if do not exists
        :param index: fasta file reference to mapp
        :return: name of generated index, "" when the build failed
        """
        dot = index.rfind('.f')
        # Without a ".f*" extension keep the whole name (rfind gives -1,
        # which as a slice bound would silently drop the last character).
        name = index if dot == -1 else index[:dot]
        if os.path.isfile(name + ".1.bt2"):
            return name
        try:
            # Argument list, no shell: paths with spaces stay intact.
            status = subprocess.call(["bowtie2-build", index, name])
        except OSError:
            # bowtie2-build could not be started at all.
            status = 1
        if status == 0:
            return name
        self.message.message_4("Error in index build")
        return ""


# #===== TESTES DA CLASSE =====================
# name = "Bowtie2"
# index_name = "/home/juliana/Documents/Projeto_Juliana/Datasets/Referencias/GRCh38.p5/GCA_000001405.20_GRCh38.p5_genomic.fna"
# threads_value = 3
# reads1_name = "/home/juliana/Documents/Eliandro-UEL/E1_S1_L001_R1_001_prinseq_1.fastq"
# reads2_name = "/home/juliana/Documents/Eliandro-UEL/E1_S1_L001_R2_001_prinseq_2.fastq"
# output_name = "/home/juliana/Documents/Testes_RNATool/eliandro_uel.sam"
# map = vo.MappVo.MappVo(name,index_name,reads1_name, reads2_name, threads_value,output_name,"",False)
# mapbo = MappBo(map)
# mapbo.make_bowtie2_index(index_name)
# # map.parm_mapp()
# # teste = map.to_string()
# # print teste
# # print teste
class MappVo(object):
    """
    Record values to run Mapp methods
    """

    # mapp_vo = MappVo("TopHat", ref, grp_file, "", thread, out_mapp, "", sing)
    def __init__(self, name, index, read1_n, read2_n, threads, out, other, single_end):
        """
        Make a first construction of object with param by user
        :param name: mapping tool: "BWA", "Bowtie2" or "TopHat"
        :param index: index / reference name
        :param read1_n: first reads file
        :param read2_n: second reads file (used only for paired-end)
        :param threads: number of threads
        :param out: output name
        :param other: extra options appended to the command
        :param single_end: stored as the paired-end flag; True or 'True'
            selects paired-end (the parameter name is misleading)
        """
        self._index_parm = ""
        self._reads1_parm = ""
        self._reads2_parm = ""
        self._threads_parm = ""
        self._output_parm = ""
        self._command_parm = ""
        self._sep = " "
        self._name = name
        self._index_name = index
        self._reads1_name = read1_n
        self._reads2_name = read2_n
        self._threads_value = threads
        self._output_name = out
        self._output_type = "--no-convert-bam "
        self._other_conf = other
        self._paired_end = single_end
        self._message = Message()
        self.parm_mapp()

    def _is_paired(self):
        """Return True when the paired-end flag is set, as bool or as 'True'."""
        # Only the string 'True' used to be recognised, so a boolean True was
        # silently treated as single-end.
        return self._paired_end in (True, 'True')

    def parm_mapp(self):
        """
        Make command parameter by mapping tool; exits for an unknown tool.
        """
        if self._name == "BWA":
            self._command_parm = "bwa mem "
            self._threads_parm = "-t "
            self._output_parm = "> "
        elif self._name == "Bowtie2":
            self._command_parm = "bowtie2 "
            self._index_parm = "-x "
            self._threads_parm = "-p "
            self._output_parm = "-S "
            if self._is_paired():
                self._reads1_parm = "-1 "
                self._reads2_parm = "-2 "
            else:
                self._reads1_parm = "-U "
        elif self._name == "TopHat":
            self._command_parm = "tophat2 "
            self._threads_parm = "-p "
            self._output_parm = "--output-dir "
        else:
            self._message.message_4("Mapping " + self._name + " not found!")
            exit()

    def to_string(self):
        """
        Return a command, this command used to run a Mapping tool
        :return: str, the command line ("" for an unknown tool)
        """
        sep = self._sep
        threads = self._threads_parm + str(self._threads_value) + sep
        index = self._index_parm + self._index_name + sep
        reads = self._reads1_parm + self._reads1_name + sep
        if self._is_paired():
            reads = reads + self._reads2_parm + self._reads2_name + sep
        output = self._output_parm + self._output_name + sep
        if self._name in ("Bowtie2", "BWA"):
            # Both tools share the same layout:
            # command, index, threads, reads, output, extra options.
            return self._command_parm + index + threads + reads + output + self._other_conf
        if self._name == "TopHat":
            other = self._other_conf
            if other and not other.endswith(sep):
                # Keep the extra options from fusing with "--output-dir".
                other = other + sep
            return self._command_parm + threads + self._output_type + other + output + index + reads
        return ""