def __init__(self):
    """
    Create the data object with its configuration key names and defaults.

    The ``*_par`` / ``*_parm`` attributes are the key names searched for in
    the configuration file; the remaining attributes are the values used
    when a key is absent from that file.
    """
    self._message = Message()
    self._file_conf = None

    # Key names of the configuration file. "ANOTATION" is the literal key
    # text read at runtime and must not be "corrected" here.
    self._name_par = "NAME"
    self._replic_parm = "REPLIC"
    self._group_number_parm = "GROUP_NUMBER"
    self._group_name_parm = "GROUP_NAMES"
    self._reference_parm = "REFERENCE_GENOME"
    self._read_directory_parm = "READS_DIRECTORY"
    self._group_directory_parm = "GROUP_DIRECTORIES"
    self._paired_end_parm = "PAIRED_END"
    self._threads_parm = "THREADS"
    self._count_mode_parm = "MODE"
    self._annotation_file_parm = "ANOTATION_FILE"
    self._annotation_type_parm = "ANOTATION_TYPE"
    self._output_parm = "OUTPUT"

    # Default values.
    self._name = ""
    # The replicate count is parsed with int() and compared numerically by
    # its consumers, so the default is an int (a list default raises
    # TypeError on ``>=`` comparisons when the key is missing).
    self._replic = 0
    self._group_number = 0
    self._group_name = []
    self._reference = ""
    self._read_directory = ""
    self._group_directory = []
    self._paired_end = False
    self._threads = 0
    self._count_mode = ""
    self._annotation_file = ""
    self._annotation_type = ""
    self._output = ""
def __init__(self, name, index, read1_n, read2_n, threads, out, other, single_end):
    """
    Store the user-supplied mapping settings, then derive the tool flags.

    :param name: name of the mapping tool
    :param index: reference index, stored unchanged
    :param read1_n: first reads file
    :param read2_n: second reads file
    :param threads: number of threads
    :param out: output name
    :param other: extra configuration text
    :param single_end: value stored as the ``_paired_end`` flag
    """
    # Command-line flags start empty; parm_mapp() fills them in below.
    self._index_parm = self._reads1_parm = self._reads2_parm = ""
    self._threads_parm = self._output_parm = self._command_parm = ""
    self._sep = " "

    # Values handed in by the caller.
    self._name = name
    self._index_name = index
    self._reads1_name, self._reads2_name = read1_n, read2_n
    self._threads_value = threads
    self._output_name = out
    self._output_type = "--no-convert-bam "
    self._other_conf = other
    self._paired_end = single_end

    self._message = Message()
    # Must run last: it reads the name and paired-end flag set above.
    self.parm_mapp()
def __init__(self, count):
    """
    Keep the count value object and create the message helper.

    :param count: CountVo instance (enforced with an assertion)
    """
    assert isinstance(count, CountVo)
    self.message = Message()
    self._counter = count
class Ebseq(object):
    """
    Run an EBSeq analysis through rpy2 and classify rows of its output.
    """

    def __init__(self, count, group, repl, out):
        """
        Store the inputs of the EBSeq analysis.

        :param count: path of the CSV count table read by R
        :param group: iterable of group (condition) names
        :param repl: number of replicates per group
        :param out: path of the file the status table is written to
        """
        self._table_count = count
        self._groups_name = group
        self._replic = repl
        self._output = out
        self._message = Message()
        # Column of the output row holding the status, and the status
        # value that counts as differentially expressed.
        self._exp_column = 1
        self._exp = "DE"

    def run_de(self, gene):
        """
        Tell whether one row of the EBSeq output is flagged as DE.

        :param gene: list of fields of one output row
        :return: 1 when the status field equals "DE", otherwise 0
        """
        try:
            status = gene[self._exp_column]
        except IndexError:
            # Blank or truncated row: nothing to vote on.
            return 0
        # The status is the last field of a tab-split line, so it still
        # carries the line terminator; strip it before comparing.
        return 1 if str(status).strip() == self._exp else 0

    def run_ebseq(self):
        """
        Execute the default analysis with EBSeq and write the status table.

        :return: None
        :raises RRuntimeError: re-raised after being reported
        """
        try:
            robjects.r('library("' + 'EBSeq' + '")')
            robjects.r('table <- read.csv("' + self._table_count +
                       '", row.names = 1, header = TRUE, stringsAsFactors=FALSE)')
            robjects.r('m <- as.matrix(table)')
            # Group names are emitted in reverse order, as the original
            # command did; only the labels change, not the sample grouping.
            grup = ", ".join("'" + ind + "'"
                             for ind in reversed(list(self._groups_name)))
            robjects.r('Sizes=MedianNorm(m)')
            robjects.r('EBOut=EBTest(Data=m, '
                       'Conditions=as.factor(rep('
                       'c(' + grup + '),each=' + str(self._replic) +
                       ')), sizeFactors=Sizes, maxround=5)')
            robjects.r('EBDERes=GetDEResults(EBOut, FDR=0.05)')
            robjects.r('write.table(EBDERes$Status, file="' + self._output +
                       '", sep = "\t", quote = FALSE)')
            self._message.message_9("--- EBSeq: is completed!")
        except RRuntimeError as rre:
            self._message.message_9("Error in EBSeq execution: " + str(rre))
            raise
def __init__(self, count, group, repl, out):
    """
    Record the analysis inputs and the status marker settings.

    :param count: count table
    :param group: group names
    :param repl: number of replicates
    :param out: output destination
    """
    self._table_count, self._groups_name = count, group
    self._replic, self._output = repl, out
    self._message = Message()
    # Status column index and the status text looked for in it.
    self._exp_column, self._exp = 1, "DE"
def __init__(self, count, group, repl, out):
    """
    Record the inputs of the NOISeq analysis.

    :param count: count table
    :param group: group names; its length sets ``_likelihood_column``
    :param repl: number of replicates
    :param out: output destination
    """
    self._message = Message()
    self._table_count, self._groups_name = count, group
    self._replic, self._output = repl, out
    # NOTE(review): presumably the output column holding the probability
    # and its cut-off - confirm against the code that reads them.
    self._likelihood = 0.95
    self._likelihood_column = 3 + len(group)
def __init__(self, count, group, repl, out):
    """
    Record the analysis inputs plus log-fold-change and p-value settings.

    :param count: count table
    :param group: group names
    :param repl: number of replicates
    :param out: output destination
    """
    self._table_count, self._groups_name = count, group
    self._replic, self._output = repl, out
    self._message = Message()
    # Column indexes of the two statistics ...
    self._logfc_column, self._pvalue_column = 2, 5
    # ... and the thresholds stored next to them.
    self._logfc, self._pvalue = 2, 0.05
def __init__(self, count, group, repl, out):
    """
    Record the analysis inputs plus log-fold-change and p-value settings.

    :param count: count table
    :param group: group names
    :param repl: number of replicates
    :param out: output destination
    """
    self._message = Message()
    self._table_count, self._groups_name = count, group
    self._replic, self._output = repl, out
    # Thresholds first, then the columns they are compared against.
    self._logfc, self._pvalue = 2, 0.05
    self._logfc_column, self._pvalue_column = 6, 7
def __init__(self, count, group, repl, out):
    """
    Record the analysis inputs plus likelihood and FDR settings.

    :param count: count table
    :param group: group names; its length enters the column indexes
    :param repl: number of replicates; also enters the column indexes
    :param out: output destination
    """
    self._table_count, self._groups_name = count, group
    self._replic, self._output = repl, out
    self._message = Message()
    # Both column indexes are offsets past len(group) * repl columns.
    sample_columns = len(group) * repl
    self._likelihood_column = 2 + sample_columns
    self._fdr_de_column = 4 + sample_columns
    self._likelihood, self._fdr = 0.95, 0.1
def __init__(self, count, group, repl, out):
    """
    Silence R warnings and record the analysis inputs and thresholds.

    :param count: count table
    :param group: group names
    :param repl: number of replicates
    :param out: output destination
    """
    # Sets the R option warn=-1 before anything else happens.
    robjects.r['options'](warn=-1)
    self._table_count, self._groups_name = count, group
    self._replic, self._output = repl, out
    self._class = '"Two class unpaired"'
    self._message = Message()
    self._fd_column, self._qvalue_column = 4, 5
    self._fd, self._qvalue = 2, 1
def __init__(self):
    """
    Create an experiment with nothing loaded yet.

    Collaborators and analysis objects start as ``None``, the collected
    paths start as empty lists.
    """
    self.message = Message()

    # Configuration, mapping and counting collaborators.
    self._exp_dao = self._reference = None
    self._count = self._expression = self._mapp_bo = None
    self._transcript = False

    # Paths gathered while the pipeline runs (one fresh list each).
    self._fastq = []
    self._out_mapp = []
    self._count_table = []
    self._merged_table_out = None

    # One slot per expression-analysis tool.
    self._edger = self._bayseq = self._deseq = None
    self._noiseq = self._ebseq = None
    self._samseq = self._limmavoom = None
class Experiment(object):
    """
    Business object of Experiment: validates the configuration, runs the
    mapping/counting step, runs every expression tool and builds the
    consensus list.
    """
    # Quoted so the annotation is not evaluated when the class is created.
    _count: "CountBo"

    def __init__(self):
        """Create an experiment with nothing loaded yet."""
        self._exp_dao = None
        self._reference = None
        self._transcript = False
        self._count = None
        self._expression = None
        self._mapp_bo = None
        self.message = Message()
        self._fastq = []
        self._out_mapp = []
        self._count_table = []
        self._merged_table_out = None
        self._edger = None
        self._bayseq = None
        self._deseq = None
        self._noiseq = None
        self._ebseq = None
        self._samseq = None
        self._limmavoom = None

    def init_experiment(self, exp, file):
        """
        Initialize the experiment from a configuration file.

        Terminates the process (``exit``) when the reference extension, a
        directory, or the paired-end setting is not acceptable.

        :param exp: ExperimentDao that reads and holds the configuration
        :param file: path of the configuration file
        :return: None
        """
        assert isinstance(exp, ExperimentDao)
        self._exp_dao = exp
        self._exp_dao.read_configuration_file(file)
        self._exp_dao._name = self.name_valid(self._exp_dao._name)
        self.rep_valid(self._exp_dao._replic)
        self.group_number_valid(self._exp_dao._group_number)
        ref = self._exp_dao._reference
        if ref == "":
            print(
                "You don't have a refserence genome... Expression analyse need a table count with mapping reads"
            )
            self._merged_table_out = input("Type absolute path to table count")
        elif self.extension_valid(ref, "fa") or self.extension_valid(ref, "fasta"):
            self._reference = ref
        else:
            self.message.message_3("REFERENCE FILE ")
            exit(0)
        self.directory_valid(self._exp_dao._read_directory, "reads")
        for i in self._exp_dao._group_directory:
            reads = self._exp_dao._read_directory + "/" + str(i)
            self.directory_valid(reads, "group")
            # Collect the fastq files of this group directory.
            path_find = self._exp_dao._read_directory + "/" + i + "/"
            self._fastq.append(self.get_reads_file(path_find))
        # The configuration reader stores this flag as text, so a plain
        # ``== True`` comparison never matched; accept both forms.
        if str(self._exp_dao._paired_end).strip().lower() == "true":
            self.message.message_8(
                "The sequence is paired-end. CONSEXPRESSION dont make paired-end analysis"
            )
            exit(0)
        else:
            self.message.message_8("The sequence is single-end")

    def name_valid(self, name):
        """
        Replace an empty experiment name with the default one.

        :param name: name of the experiment
        :return: ``name``, or "consexpression" when it is empty
        """
        if not name:
            name = "consexpression"
            self.message.message_7(
                "Experiment name is empty! The name was changed to consexpression"
            )
        return name

    def rep_valid(self, rep):
        """
        Check that the number of replicates is at least 1; exit otherwise.

        :param rep: number of replicates
        :return: None
        """
        if rep >= 1:
            self.message.message_1("replics")
        else:
            self.message.message_2(
                "1 replic or more (technique or biological)")
            self.message.message_3("number of replics in line 5 - 6")
            exit()

    def extension_valid(self, path, extension):
        """
        Tell whether ``path`` ends with ``extension``.

        :param path: file path
        :param extension: expected ending, e.g. "fa"
        :return: boolean
        """
        return str.endswith(path, extension)

    def directory_valid(self, path, type):
        """
        Check that ``path`` is a directory; exit otherwise.

        :param path: directory path
        :param type: label used in the messages (e.g. "reads", "group")
        :return: None
        """
        if os.path.isdir(path):
            self.message.message_1("directory " + type + ": " + path)
        else:
            self.message.message_2("a valid directory " + type + " path")
            self.message.message_3(" the directory " + type + " path (line 9)")
            exit()

    def group_number_valid(self, group_n):
        """
        Check that the number of groups is at least 1; exit otherwise.

        :param group_n: number of groups (int)
        :return: None
        """
        assert isinstance(group_n, int)
        if group_n >= 1:
            self.message.message_1("group number")
        else:
            self.message.message_2("1 group or more.")
            self.message.message_3("the number of gruoups in line 7")
            exit()

    def file_valid(self, path):
        """
        Check that ``path`` is a file; exit otherwise.

        :param path: file path
        :return: None
        """
        if os.path.isfile(path):
            self.message.message_1(" file: " + path)
        else:
            self.message.message_2(" a valid reference file")
            self.message.message_3(" the reference file path (line 7)")
            exit()

    def exceute_mapp_count(self):
        """
        Map every fastq file with TopHat, then count the mapped reads.

        :return: None
        """
        ref = self._reference
        thread = self._exp_dao._threads
        sing = self._exp_dao._paired_end
        n = self._exp_dao._name
        path_find = []
        for grp in self._fastq:
            for grp_file in grp:
                bar = 1 + grp_file.rfind('/')
                out_dir = grp_file[:bar] + n
                if not os.path.isdir(out_dir):
                    os.mkdir(out_dir, 0o755)
                # Rename only the file name part: rewriting 'fastq' in the
                # whole path would also alter directory components and point
                # outside the directory created just above.
                out_mapp = out_dir + "/" + grp_file[bar:].replace('fastq', 'sam')
                path_find.append(out_mapp)
                mapp_vo = MappVo("TopHat", ref, grp_file, "", thread,
                                 out_mapp, "", sing)
                self._mapp_bo = MappBo(mapp_vo)
                if self._mapp_bo.execute_mapp() != 0:
                    self.message.message_4(
                        "Task: Mapping don't run correctly.")
                    continue
                dot = out_mapp.rfind('.')
                in_type = out_mapp[dot + 1:]
                bar = 1 + out_mapp.rfind('/')
                table_count = (out_mapp[:bar] + out_mapp[bar:dot] +
                               "_table_count.txt")
                self._count_table.append(table_count)
                in_count = out_mapp + "/accepted_hits.sam"
                count_vo = CountVo(in_count,
                                   self._exp_dao._annotation_file,
                                   self._exp_dao._annotation_type, in_type,
                                   self._exp_dao._count_mode, table_count)
                self._count = CountBo(count_vo)
                if self._count.execute_count() == 0:
                    self.message.message_8("Count Sucsessfull!!!")
                else:
                    self.message.message_4(
                        "Error in counting mapped reads...")
        self._out_mapp.append(path_find)

    def get_reads_file(self, dir):
        """
        Get the path of every ``*.fastq`` file directly under ``dir``.

        :param dir: folder of the fastq samples, ending with "/"
        :return: sorted list of reads file paths
        """
        # glob returns files in arbitrary order; sort so that sample
        # numbering is reproducible between runs.
        fastq_file = sorted(glob.glob(dir + "*.fastq"))
        if len(fastq_file) == 0:
            self.message.message_7("*Not found files FASTQ in directorie " + dir)
        return fastq_file

    def execute_expression_analysis(self):
        """
        Run every expression tool on the merged count table.

        :return: None
        """
        print("Expression analisys start...")
        if self._exp_dao._reference != "":
            out_merge_table = (self._exp_dao._read_directory + "/" +
                               self._exp_dao._name + "_table_count.txt")
            self.execute_merge_table(self._count_table, out_merge_table)
        else:
            out_merge_table = self._merged_table_out
        groups = self._exp_dao._group_name
        replic = self._exp_dao._replic
        out_expression = self._exp_dao._output + "/" + self._exp_dao._name
        # 1 ------------------ edgeR -----------------
        print("---- edgeR START! ------------")
        self._edger = EdgeR(out_merge_table, groups, replic,
                            out_expression + "_edger.csv")
        self._edger.run_edger()
        # 2 ------------- BaySeq --------------------
        print("---- baySeq START! ------------")
        self._bayseq = BaySeq(out_merge_table, groups, replic,
                              out_expression + "_baySeq.csv")
        self._bayseq.run_bayseq()
        # 3 ------------- DESeq --------------------
        print("---- DESeq START! ------------")
        self._deseq = DESeq(out_merge_table, groups, replic,
                            out_expression + "_DESeq.csv")
        self._deseq.run_deseq()
        # 4 ------------- NOISeq --------------------
        print("---- NOISeq START! ------------")
        self._noiseq = Noiseq(out_merge_table, groups, replic,
                              out_expression + "_NOISeq.csv")
        self._noiseq.run_noiseq()
        # 5 ------------- EBSeq --------------------
        print("---- EBSeq START! ------------")
        self._ebseq = Ebseq(out_merge_table, groups, replic,
                            out_expression + "_EBSeq.csv")
        self._ebseq.run_ebseq()
        # 6 ------------- SAMSeq --------------------
        # SAMSeq is announced but not executed; _samseq stays None.
        print("---- SAMSeq START! ------------")
        # 7 ------------- limma-voom --------------------
        print("---- limma START! ------------")
        self._limmavoom = LimmaVoom(out_merge_table, groups, replic,
                                    out_expression + "_limmavoom.csv")
        self._limmavoom.run_limmavoom()

    @staticmethod
    def _add_votes(tool, gene_de, sep="\t", key_column=0):
        """Add one tool's per-gene run_de() votes to ``gene_de``."""
        with open(tool._output, 'r') as result:
            next(result, None)  # the first line is a header
            for line in result:
                if not line.strip():
                    continue  # blank line: no gene to vote on
                gene = line.split(sep)
                vote = int(tool.run_de(gene))
                key = gene[key_column]
                gene_de[key] = int(gene_de.get(key, 0)) + vote

    def execute_conseus(self, out):
        """
        Write the genes flagged by at least four tools.

        :param out: path of the consensus file ("gene, indications")
        :return: None
        """
        gene_de = {}
        self._add_votes(self._bayseq, gene_de)
        self._add_votes(self._edger, gene_de)
        # DESeq rows carry the gene identifier in the second field.
        self._add_votes(self._deseq, gene_de, key_column=1)
        # NOISeq output is comma separated.
        self._add_votes(self._noiseq, gene_de, sep=",")
        if self._samseq is None:
            print("SAMSeq results not found")
        else:
            self._add_votes(self._samseq, gene_de, key_column=1)
        if self._exp_dao._replic >= 2:
            self._add_votes(self._limmavoom, gene_de)
        else:
            print("limma require more than one replics")
        self._add_votes(self._ebseq, gene_de)
        # The context manager guarantees the result is flushed and closed.
        with open(out, 'w') as out_cons:
            out_cons.write('gene, indications')
            print(len(gene_de))
            for name, votes in gene_de.items():
                if votes >= 4:
                    out_cons.write("\n" + name + "," + str(votes))

    def execute_merge_table(self, out_mapp_list, out_name):
        """
        Merge the per-sample count tables into one table.

        :param out_mapp_list: list of count table paths
        :param out_name: path of the merged table
        :return: None
        """
        self._count.merge_table_count(out_mapp_list, out_name,
                                      self._exp_dao._group_name)
class DESeq(object):
    """
    Run DESeq analysis
    """
    # NOTE: the original author recorded that DESeq was not available in
    # the required version on their machine.

    def __init__(self, count, group, repl, out):
        """
        Store the inputs of the DESeq analysis.

        :param count: path of the CSV count table read by R
        :param group: iterable of group (condition) names
        :param repl: number of replicates per group (int)
        :param out: path of the file the result table is written to
        """
        self._table_count = count
        self._groups_name = group
        self._replic = repl
        self._output = out
        self._message = Message()
        # Fields of an output row holding the log fold change and p-value.
        self._logfc_column = 6
        self._pvalue_column = 7
        self._pvalue = 0.05
        self._logfc = 2

    def run_de(self, gene):
        """
        Tell whether one row of the DESeq output is flagged as DE.

        :param gene: list of fields of one output row
        :return: 1 when |log fold change| >= threshold and
                 p-value <= threshold, otherwise 0 (also for rows that
                 are too short or hold non-numeric values)
        """
        try:
            lfc = float(gene[self._logfc_column])
            pv = float(gene[self._pvalue_column])
        except (ValueError, IndexError):
            return 0
        if (lfc >= self._logfc or lfc <= -self._logfc) and pv <= self._pvalue:
            return 1
        return 0

    def run_deseq(self):
        """
        Execute the default analysis with DESeq and write the result table.

        :return: None
        :raises RRuntimeError: re-raised after being reported
        """
        try:
            for load in ('library("parallel")', 'library("stats4")',
                         'library("BiocGenerics")', 'library("Biobase")',
                         'library("locfit")', 'library(DESeq)',
                         'library("lattice")'):
                robjects.r(load)
            robjects.r('table <- read.csv("' + self._table_count +
                       '", row.names = 1, header = TRUE, stringsAsFactors=FALSE)')
            robjects.r('m <- as.matrix(table)')
            assert isinstance(self._replic, int)
            names = list(self._groups_name)
            # One condition label per sample: each group repeated per replicate.
            grup = ", ".join("'" + ind + "'"
                             for ind in names for _ in range(self._replic))
            # The test arguments list the groups in reverse order, as the
            # original command did.
            b_test = ", ".join("'" + ind + "'" for ind in reversed(names))
            robjects.r('condition = factor( c(' + grup + '))')
            robjects.r('cds <- newCountDataSet(m, condition)')
            robjects.r('cds <- estimateSizeFactors(cds)')
            if self._replic == 1:
                # Single replicate: the dispersion is estimated "blind".
                command = 'cds <- estimateDispersions(cds, method="blind", fitType="local")'
            else:
                command = 'cds <- estimateDispersions(cds, fitType="local")'
            robjects.r(command)
            robjects.r('res <- nbinomTest(cds, ' + b_test + ')')
            robjects.r('write.table(res, file="' + self._output +
                       '", sep = "\t", quote = FALSE)')
        except RRuntimeError as rre:
            self._message.message_9("Error in DESeq execution: " + str(rre))
            raise
        self._message.message_9("--- DESeq: is completed!")
class MappBo(object):
    """
    This class make rules of validate information and command, to execute Mapp tools
    """

    def __init__(self, mapp):
        """
        :param mapp: MappVo holding the mapping settings
        """
        assert isinstance(mapp, MappVo)
        self._map_vo = mapp
        self._reads_file = []
        self.message = Message()

    def threads_conf(self, threads_vo):
        """
        Bring the requested thread count into the range the system supports.

        A request larger than the number of CPUs, or smaller than 1, is
        replaced by (CPUs - 1), never less than 1.

        :param threads_vo: requested number of threads
        :return: None
        """
        threads_sys = multiprocessing.cpu_count()
        fallback = max(1, threads_sys - 1)
        if threads_vo > threads_sys:
            self._map_vo._threads_value = fallback
            self.message.message_9("The threads nunber defined is " +
                                   str(threads_vo) +
                                   ", but the system have only " +
                                   str(threads_sys))
            self.message.message_9("---> Number of threads was change to " +
                                   str(fallback))
        elif threads_vo < 1:
            # E.g. the key was missing from the configuration (default 0).
            self._map_vo._threads_value = fallback
            self.message.message_9("---> Number of threads was change to " +
                                   str(fallback))
        self.message.message_9("Successful! Threads configuration is ok!")

    def execute_mapp(self):
        """
        Execute the mapping command: 0 is ok, non-zero is a failed task.

        :return: int
        """
        self.threads_conf(self._map_vo._threads_value)
        n = self.make_bowtie2_index(self._map_vo._index_name)
        if n == "":
            # The index could not be built: do not launch the mapper.
            return 1
        self._map_vo._index_name = n
        text = self._map_vo.to_string()
        # NOTE(review): the command is a shell string assembled from
        # configuration values; paths with shell metacharacters are not
        # escaped.
        return subprocess.call(text, shell=True)

    def make_bowtie2_index(self, index):
        """
        Build the bowtie2 index of a reference unless it already exists.

        :param index: fasta reference file
        :return: index name (the reference path without its ".f*" ending),
                 or "" when bowtie2-build fails
        """
        dot = index.rfind('.f')
        # Without a ".f*" ending rfind() gives -1, which must not be used
        # as a slice bound (it would drop the last character).
        name = index[:dot] if dot != -1 else index
        # bowtie2-build names large indexes ".bt2l" instead of ".bt2".
        if os.path.isfile(name + ".1.bt2") or os.path.isfile(name + ".1.bt2l"):
            return name
        command = "bowtie2-build " + index + " " + name
        if subprocess.call(command, shell=True) == 0:
            return name
        self.message.message_4("Error in index build")
        return ""
class ExperimentDao(object):
    """
    Object manager data of experiment
    """

    # Key names of the configuration file. They are constants, so they live
    # on the class; instances still read them as ``self._name_par`` etc.
    # "ANOTATION" is the literal key text and must not be "corrected".
    _name_par = "NAME"
    _replic_parm = "REPLIC"
    _group_number_parm = "GROUP_NUMBER"
    _group_name_parm = "GROUP_NAMES"
    _reference_parm = "REFERENCE_GENOME"
    _read_directory_parm = "READS_DIRECTORY"
    _group_directory_parm = "GROUP_DIRECTORIES"
    _paired_end_parm = "PAIRED_END"
    _threads_parm = "THREADS"
    _count_mode_parm = "MODE"
    _annotation_file_parm = "ANOTATION_FILE"
    _annotation_type_parm = "ANOTATION_TYPE"
    _output_parm = "OUTPUT"

    def __init__(self):
        """Create the object with the default value of every setting."""
        self._message = Message()
        self._file_conf = None
        self._name = ""
        # int default: the value is parsed with int() and compared
        # numerically by its consumers.
        self._replic = 0
        self._group_number = 0
        self._group_name = []
        self._reference = ""
        self._read_directory = ""
        self._group_directory = []
        self._paired_end = False
        self._threads = 0
        self._count_mode = ""
        self._annotation_file = ""
        self._annotation_type = ""
        self._output = ""

    def read_configuration_file(self, file):
        """
        Read the "KEY: value" lines of a file into the attributes.

        Lines starting with "#" and blank lines are ignored. A repeated key
        keeps its first value and is reported. Numeric settings are
        converted with int(), so a malformed number raises ValueError.

        :param file: path to config file
        :return: None
        """
        self._message.message_9("- Reading configuration file.. ----")
        parms = {}
        # The context manager closes the file even if parsing fails.
        with open(file, 'r') as conf:
            for line in conf:
                l = line.rstrip("\r\n")
                if not l.strip() or l[0] == "#":
                    continue
                # Split once so a value may itself contain ": ".
                p = l.split(": ", 1)
                if p[0] in parms:
                    self._message.message_9("Parameter " + p[0] + " is repeated!")
                else:
                    parms[p[0]] = p[1] if len(p) == 2 else ""
        if self._name_par in parms:
            self._name = parms[self._name_par]
        if self._replic_parm in parms:
            self._replic = int(parms[self._replic_parm])
        if self._group_number_parm in parms:
            self._group_number = int(parms[self._group_number_parm])
        if self._group_name_parm in parms:
            self._group_name = parms[self._group_name_parm].split(',')
        if self._reference_parm in parms:
            self._reference = parms[self._reference_parm]
        if self._read_directory_parm in parms:
            self._read_directory = parms[self._read_directory_parm]
        if self._group_directory_parm in parms:
            self._group_directory = parms[self._group_directory_parm].split(',')
        if self._paired_end_parm in parms:
            # Kept as the raw text of the file (e.g. "True"), not a bool.
            self._paired_end = parms[self._paired_end_parm]
        if self._threads_parm in parms:
            self._threads = int(parms[self._threads_parm])
        if self._count_mode_parm in parms:
            self._count_mode = parms[self._count_mode_parm]
        if self._annotation_file_parm in parms:
            self._annotation_file = parms[self._annotation_file_parm]
        if self._annotation_type_parm in parms:
            self._annotation_type = parms[self._annotation_type_parm]
        if self._output_parm in parms:
            self._output = parms[self._output_parm]
class MappVo(object):
    """
    Record values to run Mapp methos
    """

    def __init__(self, name, index, read1_n, read2_n, threads, out, other, single_end):
        """
        Make a first construction of the object with the user parameters.

        :param name: mapping tool: "BWA", "Bowtie2" or "TopHat"
        :param index: reference index
        :param read1_n: first reads file
        :param read2_n: second reads file (used only for paired-end)
        :param threads: number of threads
        :param out: output name
        :param other: extra text appended to the command
        :param single_end: stored as the paired-end flag; True or the text
            'True' means paired-end (the parameter name is historical)
        """
        self._index_parm = ""
        self._reads1_parm = ""
        self._reads2_parm = ""
        self._threads_parm = ""
        self._output_parm = ""
        self._command_parm = ""
        self._sep = " "
        self._name = name
        self._index_name = index
        self._reads1_name = read1_n
        self._reads2_name = read2_n
        self._threads_value = threads
        self._output_name = out
        self._output_type = "--no-convert-bam "
        self._other_conf = other
        self._paired_end = single_end
        self._message = Message()
        self.parm_mapp()

    def _is_paired(self):
        """Tell whether the flag means paired-end (True or 'True')."""
        return self._paired_end is True or self._paired_end == 'True'

    def parm_mapp(self):
        """
        Make command parameter by mapping tool; exits on an unknown tool.
        """
        if self._name == "BWA":
            self._command_parm = "bwa mem "
            self._threads_parm = "-t "
            self._output_parm = "> "
        elif self._name == "Bowtie2":
            self._command_parm = "bowtie2 "
            self._index_parm = "-x "
            self._threads_parm = "-p "
            self._output_parm = "-S "
            if self._is_paired():
                self._reads1_parm = "-1 "
                self._reads2_parm = "-2 "
            else:
                self._reads1_parm = "-U "
        elif self._name == "TopHat":
            self._command_parm = "tophat2 "
            self._threads_parm = "-p "
            self._output_parm = "--output-dir "
        else:
            self._message.message_4("Mapping " + self._name + " not found!")
            exit()

    def to_string(self):
        """
        Return the command line used to run the mapping tool.

        :return: str, "" for an unknown tool name
        """
        if self._name not in ("Bowtie2", "BWA", "TopHat"):
            return ""
        index = self._index_parm + self._index_name + self._sep
        threads = self._threads_parm + str(self._threads_value) + self._sep
        output = self._output_parm + self._output_name + self._sep
        reads = self._reads1_parm + self._reads1_name + self._sep
        if self._is_paired():
            reads = reads + self._reads2_parm + self._reads2_name + self._sep
        if self._name == "TopHat":
            # TopHat takes its options first, then the index and the reads.
            return (self._command_parm + threads + self._output_type +
                    self._other_conf + output + index + reads)
        # Bowtie2 and BWA share one layout.
        return (self._command_parm + index + threads + reads + output +
                self._other_conf)
def __init__(self, mapp):
    """
    Keep the mapping value object and start with no reads files.

    :param mapp: MappVo instance (enforced with an assertion)
    """
    assert isinstance(mapp, MappVo)
    self.message = Message()
    self._reads_file = []
    self._map_vo = mapp
class SamSeq(object):
    """
    Run a SAMseq analysis (R package samr) through rpy2 and classify rows
    of its output.
    """

    def __init__(self, count, group, repl, out):
        """
        Store the inputs of the SAMseq analysis and silence R warnings.

        :param count: path of the CSV count table read by R
        :param group: list of group (condition) names
        :param repl: number of replicates per group
        :param out: path of the file the result table is written to
        """
        robjects.r['options'](warn=-1)
        self._table_count = count
        self._groups_name = group
        self._replic = repl
        self._output = out
        # Response type handed to SAMseq; switched to "Multiclass" by
        # run_samseq() when there are more than two groups.
        self._class = '"Two class unpaired"'
        self._message = Message()
        self._fd_column = 4
        self._qvalue_column = 5
        self._qvalue = 1
        self._fd = 2

    def run_de(self, gene):
        """
        Tell whether one row of the SAMseq output is flagged as DE.

        :param gene: list of fields of one output row
        :return: 1 when the fold-change field is <= ``_fd`` and the q-value
                 field is <= ``_qvalue``, otherwise 0 (also for rows that
                 are too short or hold non-numeric values)
        """
        try:
            fd = float(gene[self._fd_column])
            qv = float(gene[self._qvalue_column])
        except (ValueError, IndexError):
            return 0
        # The q-value threshold applies to the q-value, not to the
        # fold change.
        if fd <= self._fd and qv <= self._qvalue:
            return 1
        return 0

    def run_samseq(self):
        """
        Execute the default analysis with SAMSeq and write the result table.

        An R error is reported through the message helper and not re-raised.

        :return: None
        """
        try:
            if len(self._groups_name) > 2:
                self._class = '"Multiclass"'
            robjects.r('library("' + 'samr' + '")')
            robjects.r('table <- read.csv("' + self._table_count +
                       '", row.names = 1, header = TRUE, stringsAsFactors=FALSE, sep = "' +
                       ',' + '")')
            robjects.r('m <- as.matrix(table)')
            grup = ",".join('"' + ind + '"' for ind in self._groups_name)
            robjects.r('SAMseq.test = SAMseq(m, as.factor(rep(c(' + grup +
                       '),each=' + str(self._replic) + ')), resp.type = ' +
                       self._class +
                       ', geneid = rownames(m), genenames = rownames(m), nperms = 100)')
            robjects.r('SAMseq.result.table = rbind(SAMseq.test$siggenes.table$genes.up, SAMseq.test$siggenes.table$genes.lo)')
            robjects.r('SAMseq.score = rep(0, nrow(m))')
            robjects.r('SAMseq.score[match(SAMseq.result.table[,1], rownames(m))] = as.numeric(SAMseq.result.table[,3])')
            robjects.r('SAMseq.FDR = rep(1, nrow(m))')
            robjects.r('SAMseq.FDR[match(SAMseq.result.table[,1], rownames(m))] = as.numeric(SAMseq.result.table[,5])/100')
            robjects.r('write.table(SAMseq.result.table, file="' +
                       self._output + '", sep = "\t", quote = FALSE)')
            self._message.message_9("--- SAMSeq: is completed!")
        except RRuntimeError as rre:
            # Best effort: the failure is reported and execution continues.
            self._message.message_9("Error in SAMSeq execution: " + str(rre))
class BaySeq(object):
    """
    Commands to run BaySeq expression analysis
    """

    def __init__(self, count, group, repl, out):
        """
        Store the inputs of the baySeq analysis.

        :param count: path of the CSV count table read by R
        :param group: sequence of group (condition) names
        :param repl: number of replicates per group (int)
        :param out: path of the file the result table is written to
        """
        self._table_count = count
        self._groups_name = group
        self._replic = repl
        self._output = out
        self._message = Message()
        # Both fields sit after the len(group) * repl sample columns.
        self._likelihood_column = 2 + len(group) * repl
        self._fdr_de_column = 4 + len(group) * repl
        self._likelihood = 0.95
        self._fdr = 0.1

    def run_de(self, gene):
        """
        Tell whether one row of the baySeq output is flagged as DE.

        :param gene: list of fields of one output row
        :return: 1 when FDR <= threshold and likelihood > threshold,
                 otherwise 0 (also for rows that are too short or hold
                 non-numeric values)
        """
        try:
            fdr = float(gene[self._fdr_de_column])
            like = float(gene[self._likelihood_column])
        except (ValueError, IndexError):
            return 0
        return 1 if fdr <= self._fdr and like > self._likelihood else 0

    def _replicates_command(self):
        """Build the R vector naming the group of every sample."""
        labels = ", ".join("'" + ind + "'"
                           for ind in self._groups_name
                           for _ in range(self._replic))
        return 'replicates <- c(' + labels + ')'

    def _groups_command(self):
        """Build the R list of models: NDE (one group) and DE (one per group)."""
        n_groups = len(self._groups_name)
        # Each model vector needs one entry per sample (column of the
        # count matrix), i.e. groups * replicates entries.
        nde = ",".join(["1"] * (n_groups * self._replic))
        de = ",".join(str(g + 1)
                      for g in range(n_groups)
                      for _ in range(self._replic))
        return 'groups <- list(NDE = c(' + nde + '), DE = c(' + de + '))'

    def run_bayseq(self):
        """
        Execute the default analysis with baySeq and write the result table.

        An R error is reported through the message helper and not re-raised.

        :return: None
        """
        try:
            for lib in ("parallel", "stats4", "BiocGenerics", "S4Vectors",
                        "IRanges", "GenomeInfoDb", "abind", "perm",
                        "GenomicRanges", "baySeq"):
                robjects.r('library("' + lib + '")')
            # R spells the null object NULL.
            robjects.r(
                'if(require("parallel")) cl <- makeCluster(4) else cl <- NULL')
            robjects.r('table <- read.csv("' + self._table_count +
                       '", row.names = 1, header = TRUE, stringsAsFactors = FALSE)')
            robjects.r('m <- as.matrix(table)')
            assert isinstance(self._replic, int)
            robjects.r(self._replicates_command())
            robjects.r(self._groups_command())
            robjects.r(
                'CD <- new("countData", data = m, replicates = replicates, groups = groups)'
            )
            robjects.r('libsizes(CD) <- getLibsizes(CD)')
            robjects.r(
                'CD <- getPriors.NB(CD, samplesize = 1000, estimation = "QL", cl = cl, equalDispersions = TRUE)'
            )
            robjects.r(
                'CD <- getLikelihoods(CD, prs=c(0.5, 0.5), pET="BIC", cl=cl)')
            robjects.r(
                'write.table(topCounts(CD, group = "DE", number = 65000, normaliseData = TRUE), "'
                + self._output + '", sep="\t", quote = FALSE)')
            self._message.message_9("--- baySeq is completed!")
        except RRuntimeError as rre:
            # Best effort: the failure is reported and execution continues.
            self._message.message_9("Error in baySeq execution: " + str(rre))
class CountBo(object):
    """
    This object defines the business rules to execute the count step and
    to build the count table
    """

    # Special counter rows of the count files that are not genes and must
    # stay out of the merged table.
    _NO_GENES = frozenset((
        '__no_feature', '__ambiguous', '__too_low_aQual', '__not_aligned',
        '__alignment_not_unique', 'not_aligned', 'no_feature', 'ambiguous',
        'too_low_aQual', 'alignment_not_unique',
    ))

    def __init__(self, count):
        """
        Keep the count configuration used by the other methods
        :param count: CountVo with the parameters of the count command
        """
        assert isinstance(count, CountVo)
        self._counter = count
        self.message = Message()

    def annotation_format(self):
        """
        Verify format of annotation file (default: GTF | GFF)
        Reports through message_4 when the extension is anything else.
        :return: void
        """
        # rpartition yields the text after the last dot, without the dot;
        # an empty separator means the name has no extension at all.
        _, dot, extension = self._counter.annotation_file.rpartition('.')
        if not dot or extension.lower() not in ('gtf', 'gff'):
            self.message.message_4('File extension of annotation file can be only GTF or GFF.')

    def execute_count(self):
        """
        Execute command htseq-count
        :return: int return code of the subprocess
        """
        # NOTE(review): the command text is run through the shell, so
        # everything that CountVo puts in it must come from trusted input.
        return subprocess.call(self._counter.to_string(), shell=True)

    def merge_table_count(self, list_file, out, groups_name):
        """
        Make a table with the count of all samples
        :param list_file: array of count files (tab separated: gene, count),
                          ordered by group
        :param out: text file line (gene) column (sample) data (count mapped)
        :param groups_name: treatment of samples
        :return: void
        """
        rep = len(list_file) // len(groups_name)
        header = ["gene"]
        counts = {}
        group_index = 0
        rep_count = 1
        for path in list_file:
            if rep_count > rep:
                # All replicates of the current group were read: next group.
                rep_count = 1
                group_index += 1
            header.append(groups_name[group_index] + str(rep_count))
            with open(path, 'r') as handle:
                for line in handle:
                    fields = line.rstrip().split("\t")
                    # Skip blank/malformed lines and the special counters.
                    if len(fields) < 2 or fields[0] in self._NO_GENES:
                        continue
                    counts.setdefault(fields[0], []).append(fields[1])
            rep_count += 1
        with open(out, 'w') as out_file:
            out_file.write(",".join(header))
            for gene, values in counts.items():
                out_file.write("\n" + gene + "," + ",".join(values))
class EdgeR(object):
    """
    Commands to run edgeR expression analysis
    """

    def __init__(self, count, group, repl, out):
        """
        Define the edgeR object
        :param count: path of the count table (CSV) that R reads
        :param group: sequence with the names of the sample groups
        :param repl: number of replicates of each group (int)
        :param out: path of the result table written by R
        """
        self._table_count = count
        self._groups_name = group
        self._replic = repl
        self._output = out
        self._column_result = [3, 4]
        self._min_result = []
        self._message = Message()
        # Positions of logFC and p-value in one split line of the result.
        self._logfc_colum = 1
        self._pvalue_colum = 3
        self._pvalue = 0.05
        self._logfc = 2

    def run_de(self, gene):
        """
        Classify one line of the edgeR result table
        :param gene: list with the fields of one line of the result file
        :return: 1 when |logFC| reaches the logFC threshold and the p-value
                 is at most the p-value threshold, 0 otherwise (including
                 header, non numeric or short lines)
        """
        try:
            lfc = float(gene[self._logfc_colum])
            pv = float(gene[self._pvalue_colum])
        except (ValueError, IndexError):
            # Same policy as the baySeq/NOISeq classifiers of this file.
            return 0
        strong_change = lfc >= self._logfc or lfc <= -self._logfc
        # Significant means a SMALL p-value.
        if strong_change and pv <= self._pvalue:
            return 1
        return 0

    def run_edger(self):
        """
        Execute default analysis with edgeR
        :return: void
        :raise RRuntimeError: when R fails (after reporting the message)
        """
        try:
            finish_message = ""
            robjects.r('library("limma")')
            robjects.r('library("edgeR")')
            robjects.r('table <- read.csv("' + self._table_count
                       + '", row.names = 1, header = TRUE, stringsAsFactors=FALSE, sep = ",")')
            robjects.r('m <- as.matrix(table)')
            assert isinstance(self._replic, int)
            # One quoted group label for every sample.
            labels = ["'" + name + "'"
                      for name in self._groups_name
                      for _ in range(self._replic)]
            robjects.r('group <- c(' + ", ".join(labels) + ')')
            robjects.r('y.dge <- DGEList(counts = m, group = group)')
            write_result = ('write.table(y.tp$table, "' + self._output
                            + '", sep = "\t", quote = FALSE)')
            if self._replic < 1:
                self._message.message_4(
                    " Replicates not found by edgeR. EdgeR should be executed manual form.")
            elif self._replic == 1:
                # edgeR manual based solution for without replicates:
                # a fixed dispersion is given to the exact test.
                robjects.r('bcv <- 0.2')
                robjects.r('y.et <- exactTest(y.dge, dispersion = bcv^2)')
                robjects.r('y.tp <- topTags(y.et, n = 100000)')
                robjects.r('y.pvalues <- y.et$table$PValue')
                robjects.r(write_result)
                finish_message = "--- edgeR without replicates is completed!"
            else:
                robjects.r('y.dge <- calcNormFactors(y.dge)')
                robjects.r('y.dge <- estimateDisp(y.dge)')
                robjects.r('y.dge <- estimateCommonDisp(y.dge)')
                robjects.r('y.et <- exactTest(y.dge)')
                robjects.r('y.tp <- topTags(y.et, n = 100000)')
                robjects.r('y.pvalues <- y.et$table$PValue')
                robjects.r(write_result)
                finish_message = "--- edgeR with replicates is completed!"
            self._message.message_9(finish_message)
        except RRuntimeError as rre:
            self._message.message_9("Error in edgeR execution: " + str(rre))
            raise rre
class Noiseq(object):
    """
    Commands to run NOISeq expression analysis
    """

    # R packages attached, in this order, before the analysis starts.
    _R_LIBRARIES = ("parallel", "splines", "Matrix", "BiocGenerics",
                    "Biobase", "NOISeq")

    def __init__(self, count, group, repl, out):
        """
        Define the NOISeq object
        :param count: path of the count table (CSV) that R reads
        :param group: sequence with the names of the sample groups
        :param repl: number of replicates of each group (int)
        :param out: path of the result file written by R
        """
        self._table_count = count
        self._groups_name = group
        self._replic = repl
        self._output = out
        self._message = Message()
        self._likelihood_column = len(group) + 3
        self._likelihood = 0.95

    def run_de(self, gene):
        """
        Classify one line of the NOISeq result table
        :param gene: list with the fields of one line of the result file
        :return: 1 when the likelihood reaches the threshold, 0 otherwise
                 (including header, non numeric or short lines)
        """
        try:
            like = float(gene[self._likelihood_column])
        except (ValueError, IndexError):
            # Header line, text in the numeric column or a truncated row.
            return 0
        return 1 if like >= self._likelihood else 0

    def _factors_command(self):
        """
        Build the R statement that creates the factors data frame
        :return: str with one Tissue, TissueRun and Run entry per sample
        """
        tissue = []
        tissue_run = []
        run = []
        for name in self._groups_name:
            for number in range(1, self._replic + 1):
                tissue.append("'" + name + "'")
                tissue_run.append("'" + name + str(number) + "'")
                run.append("'R" + str(number) + "'")
        return ('myfactors = data.frame(Tissue=c(' + ", ".join(tissue)
                + '), TissueRun=c(' + ", ".join(tissue_run)
                + '), Run=c(' + ", ".join(run) + '))')

    def run_noiseq(self):
        """
        Execute default analysis with NOISeq
        :return: void
        :raise RRuntimeError: when R fails (after reporting the message)
        """
        try:
            for package in self._R_LIBRARIES:
                robjects.r('library("' + package + '")')
            robjects.r('table <- read.csv("' + self._table_count
                       + '", row.names = 1, header = TRUE, stringsAsFactors=FALSE)')
            robjects.r('table <- as.matrix(table)')
            assert isinstance(self._replic, int)
            robjects.r(self._factors_command())
            robjects.r('mydata <- readData(data = table, factors = myfactors)')
            robjects.r(
                'mynoiseq = noiseq(mydata, k = 0.5, factor = "Tissue", lc = 1, replicates = "technical")'
            )
            robjects.r('results <- head(mynoiseq@results)')
            # write.csv always writes comma separated values: R ignores a
            # `sep` argument (with a warning), so none is given.
            robjects.r('write.csv(results, file="' + self._output
                       + '", quote = FALSE)')
            self._message.message_9("--- NOISeq: is completed!")
        except RRuntimeError as rre:
            self._message.message_9("Error in NOISeq execution: " + str(rre))
            raise rre
# ========================= CLASS TEST =========================
# inp = 'UHR_vs_Brain_gencode_TopHat_NOISeq.csv'
# inp = 'consexpression_NOISeq.csv'
# grp = "g1", "g2"
# rep = 1
# out = 'consexpression_NOISeq_out.csv'
# b = Noiseq(inp, grp, rep, out)
# read_bay = open(inp, 'r')
# c_b = 0
# for line in iter(read_bay):
#     # print('--' + line)
#     if c_b > 0:
#         gene = line.split(",")
#         print(gene[0])
#         v = b.run_de(gene)
#         print('--> ' + str(v))
#     c_b += 1
class LimmaVoom(object):
    """
    Commands to run limma-voom expression analysis
    """

    def __init__(self, count, group, repl, out):
        """
        Define the limma-voom object
        :param count: path of the count table (CSV) that R reads
        :param group: sequence with the names of the sample groups
        :param repl: number of replicates of each group (int)
        :param out: path of the result table written by R
        """
        self._table_count = count
        self._groups_name = group
        self._replic = repl
        self._output = out
        self._message = Message()
        # Positions of logFC and p-value in one split line of the result.
        self._logfc_column = 2
        self._pvalue_column = 5
        self._logfc = 2
        self._pvalue = 0.05

    def run_de(self, gene):
        """
        Classify one line of the limma-voom result table
        :param gene: list with the fields of one line of the result file
        :return: 1 when |logFC| reaches the logFC threshold and the p-value
                 is at most the p-value threshold, 0 otherwise (including
                 header, non numeric or short lines)
        """
        try:
            lfc = float(gene[self._logfc_column])
            pv = float(gene[self._pvalue_column])
        except (ValueError, IndexError):
            # Same policy as the baySeq/NOISeq classifiers of this file.
            return 0
        strong_change = lfc >= self._logfc or lfc <= -self._logfc
        # Significant means a SMALL p-value.
        if strong_change and pv <= self._pvalue:
            return 1
        return 0

    def run_limmavoom(self):
        """
        Execute default analysis with Limma-voom
        The analysis is skipped (with a message) when there is exactly one
        replicate.
        :return: void
        :raise RRuntimeError: when R fails (after reporting the message)
        """
        if self._replic == 1:
            self._message.message_4(
                "limma-voom require more than one replics.")
            self._message.message_9("--- limma-voom: is kipped!")
            return
        try:
            robjects.r('library("edgeR")')
            robjects.r('library("limma")')
            robjects.r('table <- read.csv("' + self._table_count
                       + '", row.names = 1, header = TRUE, stringsAsFactors=FALSE)')
            robjects.r('m <- as.matrix(table)')
            robjects.r('nf = calcNormFactors(m, method = "TMM")')
            # One quoted group label for every sample.
            labels = ['"' + name + '"'
                      for name in self._groups_name
                      for _ in range(self._replic)]
            robjects.r('condition = factor(c(' + ",".join(labels) + '))')
            # `lib.size` is the voom argument that receives the library
            # sizes scaled by the TMM factors.
            robjects.r(
                'voom.data <- voom(m, model.matrix(~factor(condition)), lib.size = colSums(m) * nf)'
            )
            robjects.r('voom.data$genes = rownames(m)')
            robjects.r(
                'voom.fitlimma = lmFit(voom.data, design=model.matrix(~factor(condition)))'
            )
            robjects.r('voom.fitbayes = eBayes(voom.fitlimma)')
            robjects.r('voom.pvalues = voom.fitbayes$p.value[, 2]')
            robjects.r('voom.adjpvalues = p.adjust(voom.pvalues, method="BH")')
            design = ['1'] * self._replic + ['2'] * self._replic
            robjects.r('design <- c(' + ",".join(design) + ')')
            robjects.r(
                'data <- topTable(voom.fitbayes, coef=ncol(design), number=1000000)'
            )
            robjects.r('write.table(data, file="' + self._output
                       + '", sep = "\t", quote = FALSE)')
            self._message.message_9("--- limma-voom: is completed!")
        except RRuntimeError as rre:
            self._message.message_9("Error in limma-voom execution: " + str(rre))
            raise rre
class BaySeq(object):
    """
    Commands to run BaySeq expression analysis
    """

    # R packages attached, in this order, before the analysis starts.
    _R_LIBRARIES = (
        "parallel", "stats4", "BiocGenerics", "S4Vectors", "IRanges",
        "GenomeInfoDb", "abind", "GenomicRanges", "baySeq",
    )

    def __init__(self, count, group, repl, output):
        """
        Define the baySeq object
        :param count: path of the count table (CSV) that R reads
        :param group: sequence with the names of the sample groups
        :param repl: number of replicates of each group (int)
        :param output: path of the result table written by R
        """
        self._table_count = count
        self._groups_name = group
        self._replic = repl
        self._output = output
        self._message = Message()
        # Both column indexes are shifted by the number of samples
        # (groups * replicates).
        self._likelihood_column = 2 + len(group) * repl
        self._fdr_de_column = 4 + len(group) * repl
        self._likelihood = 0.95
        self._fdr = 0.1

    def run_de(self, gene):
        """
        Classify one line of the baySeq result table
        :param gene: list with the fields of one line of the result file
        :return: 1 when FDR <= threshold and likelihood > threshold,
                 0 otherwise (including header, non numeric or short lines)
        """
        try:
            fdr = float(gene[self._fdr_de_column])
            like = float(gene[self._likelihood_column])
        except (ValueError, IndexError):
            # Header line, text in a numeric column or a truncated row.
            return 0
        if fdr <= self._fdr and like > self._likelihood:
            return 1
        return 0

    def _replicates_command(self):
        """
        Build the R statement that labels every sample with its group name
        :return: str, e.g. replicates <- c('g1', 'g1', 'g2', 'g2')
        """
        labels = ["'" + name + "'"
                  for name in self._groups_name
                  for _ in range(self._replic)]
        return 'replicates <- c(' + ", ".join(labels) + ')'

    def _groups_command(self):
        """
        Build the R statement with the NDE and DE models
        :return: str, e.g. groups <- list(NDE = c(1,1,1,1), DE = c(1,1,2,2))
        """
        n_groups = len(self._groups_name)
        # Every model vector needs one entry per sample: NDE puts all the
        # samples in group 1, DE numbers the samples after their group.
        nde = ",".join(["1"] * (n_groups * self._replic))
        de = ",".join(str(number)
                      for number in range(1, n_groups + 1)
                      for _ in range(self._replic))
        return 'groups <- list(NDE = c(' + nde + '), DE = c(' + de + '))'

    def run_bayseq(self):
        """
        Execute default analysis with baySeq
        :return: void
        :raise RRuntimeError: when R fails (after reporting the message)
        """
        try:
            for package in self._R_LIBRARIES:
                robjects.r('library("' + package + '")')
            # NOTE(review): the cluster created here is not stopped by this method.
            robjects.r(
                'if(require("parallel")) cl <- makeCluster(4) else cl <- NULL')
            robjects.r('table <- read.csv("' + self._table_count
                       + '", row.names = 1, header = TRUE, stringsAsFactors = FALSE)')
            robjects.r('m <- as.matrix(table)')
            assert isinstance(self._replic, int)
            robjects.r(self._replicates_command())
            robjects.r(self._groups_command())
            robjects.r(
                'CD <- new("countData", data = m, replicates = replicates, groups = groups)'
            )
            robjects.r('libsizes(CD) <- getLibsizes(CD)')
            robjects.r(
                'CD <- getPriors.NB(CD, samplesize = 1000, estimation = "QL", cl = cl, equalDispersions = TRUE)'
            )
            robjects.r(
                'CD <- getLikelihoods(CD, prs=c(0.5, 0.5), pET="BIC", cl=cl)')
            # CD.posteriors.DE < - exp(CD @ posteriors)[, 2]
            robjects.r(
                'write.table(topCounts(CD, group = "DE", number = 65000, normaliseData = TRUE), "'
                + self._output + '", sep="\t", quote = FALSE)')
            self._message.message_9("--- baySeq is completed!")
        except RRuntimeError as rre:
            self._message.message_9("Error in baySeq execution: " + str(rre))
            raise rre