def DDI_annotation(extension, pfam_path, filter_flag, spe_level, interact, pfams, human_pfam, suffix):
    filelist = utilities.find_files(pfam_path, extension, None)
    for myfile in filelist:
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File does not exist: " + myfile)
        else:
            myout = re.sub(extension, suffix, myfile)
            myout_detail = re.sub(".tsv", ".detail.tsv", myout)
            peptide = collect_pfam_info(myfile)
            assign_interaction(filter_flag, spe_level, pfams, human_pfam, interact, peptide, myout, myout_detail)
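# A minimal usage sketch for DDI_annotation (hypothetical paths and argument
# values; the `interact`, `pfams`, and `human_pfam` lookup tables would be
# built elsewhere in the pipeline):
#
#   DDI_annotation("interpro.PfamDomain.tsv", "/path/to/annotations",
#                  filter_flag, spe_level, interact, pfams, human_pfam,
#                  "interpro.DDI.tsv")
#
# For each per-sample Pfam table found under the annotation path, this writes
# a matching "*.interpro.DDI.tsv" table plus a "*.interpro.DDI.detail.tsv"
# companion file.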
def collect_pfam_info(cluster_mem, extension, ann_path, outfile):
    pfams = {}
    filelist = utilities.find_files(ann_path, extension, None)
    for myfile in filelist:
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File does not exist: " + myfile)
            continue
        open_file = open(myfile, "r")
        for line in open_file.readlines():
            line = line.strip()
            if not len(line):
                continue
            if re.search("^" + utilities.PROTEIN_ID, line):
                continue
            info = line.split("\t")
            myid = info[0]
            if not myid in cluster_mem:
                continue
            pfam = info[1]
            if not myid in pfams:
                pfams[myid] = {}
            pfams[myid][pfam] = info[2]
        open_file.close()

    # output details
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.", outfile)
    open_out = open(outfile1, "w")
    open_out.write(utilities.PROTEIN_ID + "\ttype\tdetail\tdescription\n")
    for myid in sorted(pfams.keys()):
        myacc = ";".join(sorted(pfams[myid].keys()))
        myann = ";".join(pfams[myid][item] for item in sorted(pfams[myid].keys()))
        open_out.write(myid + "\tPfam_PfamDomain\t" + myacc + "\t" + myann + "\n")
    open_out.close()
    return pfams
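# Illustration of the data flow through collect_pfam_info (values are made
# up): given a per-sample table row
#
#   sample1_10    PF00005    ABC transporter
#
# the function records pfams["sample1_10"]["PF00005"] = "ABC transporter" and
# writes one semicolon-joined line per ORF to the "*_proteinfamilies.ORF.*"
# detail file.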
def collect_counts(map_path, extension, gene_cluster):
    counts = {}
    mysample = {}
    # (disabled) optionally restrict to the samples listed in a sample file:
    #samples = {}
    #open_file = open(sample_file, "r")
    #for line in open_file:
    #    line = line.strip()
    #    if not len(line):
    #        continue
    #    samples[line.split("\t")[0]] = ""
    #open_file.close()
    filelist = utilities.find_files(map_path, extension, None)
    for myfile in filelist:
        mym = re.search("([^\/]+)$", myfile)
        sample = mym.group(1)
        sample = re.sub("." + extension, "", sample)
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File does not exist: " + myfile)
            continue
        mysample[sample] = ""
        open_file = open(myfile, "r")
        for line in open_file:
            line = line.strip()
            if not len(line):
                continue
            if re.search("^#", line):
                continue
            if re.search("^Geneid", line):
                continue
            info = line.split("\t")
            myid = info[0]
            if not myid in gene_cluster:
                # skip genes outside the specified cluster set
                continue
            mycount = info[-1]
            if mycount == str(0):
                # skip zero counts
                continue
            if not myid in counts:
                counts[myid] = {}
            counts[myid][sample] = mycount
        open_file.close()
    return counts, mysample
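# collect_counts assumes featureCounts-style tables: a leading "#" comment
# line, a "Geneid ..." header, and the read count in the last column of each
# row, e.g. (illustrative):
#
#   Geneid    Chr    Start    End    Strand    Length    sample1.sort.bam
#   gene_1    c_1    1        300    +         300       42
#
# Only genes present in `gene_cluster` with non-zero counts are kept.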
def format_contig_info(contig_path, extension, outfile):
    filelist = utilities.find_files(contig_path, extension, None)
    open_out = open(outfile, "w")
    for myfile in filelist:
        myfile = myfile.strip()
        if not len(myfile):
            continue
        sample = myfile
        mym = re.search("([^\/]+)$", sample)
        sample = mym.group(1)
        sample = re.sub("." + extension, "", sample)

        # collect seq info
        if not os.path.isfile(myfile):
            config.logger.info("WARNING! Contig file doesn't exist!\t" + myfile)
            continue
        open_contig = open(myfile, "r")
        contigs = {}
        contig_order = []
        myid = ""
        for line in open_contig:
            line = line.strip()
            if not len(line):
                continue
            if re.search("^>", line):
                # prefix the contig id with the sample name
                mym = re.search(">([\S]+)", line)
                myid = ">" + sample + "_contig_" + mym.group(1)
                if not myid in contigs:
                    contig_order.append(myid)
                    contigs[myid] = ""
                continue
            contigs[myid] = contigs[myid] + line
        open_contig.close()

        # output contig sequences in their original order
        for myid in contig_order:
            if myid in contigs:
                open_out.write(myid + "\n" + contigs[myid] + "\n")
    open_out.close()
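# Example of the renaming convention applied by format_contig_info
# (hypothetical MEGAHIT-style input): a header ">k141_0 flag=1 len=312" in
# sample1.contigs.fa becomes ">sample1_contig_k141_0" in the combined output
# (only the first token of the header is kept), so contig IDs stay unique
# across samples.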
def collect_transmembrane_info(cluster_mem, extension, ann_path, outfile):
    transmem = {}
    details = {}
    filelist = utilities.find_files(ann_path, extension, None)
    for myfile in filelist:
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File does not exist: " + myfile)
            continue
        open_file = open(myfile, "r")
        titles = {}
        for line in open_file.readlines():
            line = line.strip()
            if not len(line):
                continue
            info = line.split("\t")
            if re.search("^" + utilities.PROTEIN_ID, line):
                # index columns by header name
                for item in info:
                    titles[item] = info.index(item)
                continue
            myid = info[titles[utilities.PROTEIN_ID]]
            if not myid in cluster_mem:
                continue
            details[myid] = info[titles["Prediction"]]
            sample = re.sub("_[\d]+$", "", myid)
            if not sample in transmem:
                transmem[sample] = {}
            transmem[sample][myid] = info[titles["Prediction"]]
        open_file.close()

    # output details
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.", outfile)
    open_out = open(outfile1, "w")
    open_out.write(utilities.PROTEIN_ID + "\ttype\tdetail\tdescription\n")
    for myid in sorted(details.keys()):
        open_out.write(myid + "\tTMHMM_transmembrane\tTMHMM_transmembrane\t" + details[myid] + "\n")
    open_out.close()
    return transmem
def gene_calling(workflow, assembly_dir, assembly_extentsion, input_dir, extension, extension_paired,
                 gene_call_type, prokka_dir, prodigal_dir, threads,
                 gene_file, gene_PC_file, protein_file, protein_sort,
                 gene_info, complete_gene, complete_protein):
    """
    This set of tasks will run the gene-calling workflow.

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        assembly_dir: The directory path of assembly results.
        assembly_extentsion: The extension of the per-sample assembled contig files.
        input_dir: The directory path of input fastq files.
        extension: The extension for all reads files, e.g. .fastq.gz
        extension_paired: The extension for paired reads, e.g. _R1.fastq.gz,_R2.fastq.gz
        gene_call_type: The gene caller(s) to run: "prodigal", "prokka", or "both".
        prokka_dir: The directory path of prokka results.
        prodigal_dir: The directory path of prodigal results.
        threads (int): The number of threads/cores to use.
        gene_file: The fasta file of gene nucleotide sequences.
        gene_PC_file: The fasta file of protein-coding gene nucleotide sequences.
        protein_file: The fasta file of protein sequences.
        protein_sort: The sorted fasta file of protein sequences.
        gene_info: The summarized gene-calling file.
        complete_gene: The fasta file of gene nucleotide sequences for complete ORFs.
        complete_protein: The fasta file of protein sequences for complete ORFs.

    Requires:
        prokka 1.14-dev: rapid prokaryotic genome annotation (recommended: disable the '-c' parameter of its prodigal call)
        prodigal v2.6: gene prediction
        usearch (tested with usearch v9.0.2132_i86linux64)
        assembled contig files

    Returns:
        string: the names of the complete gene and protein fasta files

    Example:
        from anadama2 import Workflow
        from MetaWIBELE.characterize import characterization

        # create an anadama2 workflow instance
        workflow=Workflow()

        # add gene calling tasks
        mygene, myprotein = preprocessing_tasks.gene_calling (workflow, assembly_dir, assembly_extentsion, input_dir, extension, extension_paired, gene_call_type, prokka_dir, prodigal_dir, threads, gene_file, gene_PC_file, protein_file, protein_sort, gene_info, complete_gene, complete_protein)

        # run the workflow
        workflow.go()
    """

    config.logger.info("###### Start gene_calling module ######")
    time_equation = config.time  # xxx hours defined in global config
    mem_equation = config.memory  # xxx GB defined in global config

    # ================================================
    # collect sequences
    # ================================================
    if extension_paired:
        extension_paireds = extension_paired.split(",")
        sample_files = utilities.find_files(input_dir, extension_paireds[0], None)
        samples = utilities.sample_names(sample_files, extension_paireds[0], None)
    else:
        sample_files = utilities.find_files(input_dir, extension, None)
        samples = utilities.sample_names(sample_files, extension, None)
    sequence_files = []
    for mysample in samples:
        myfile = os.path.join(assembly_dir, mysample, mysample + "%s" % assembly_extentsion)
        sequence_files.append(myfile)
    filtered_contigs = sequence_files

    # ================================================
    # Gene calling
    # ================================================
    fna_file = []
    faa_file = []
    gff_files = []
    fna_file_tmp = []
    faa_file_tmp = []
    gff_files_tmp = []

    ## Using Prodigal
    if gene_call_type == "prodigal" or gene_call_type == "both":
        os.system("mkdir -p " + prodigal_dir)
        for contig in filtered_contigs:
            contig_base = os.path.basename(contig).split(os.extsep)[0]
            annotation_dir = os.path.join(prodigal_dir, contig_base)
            os.system("mkdir -p " + annotation_dir)
            gff_file = os.path.join(annotation_dir, '%s.gff' % contig_base)
            cds_file = os.path.join(annotation_dir, '%s.fna' % contig_base)
            cds_aa = os.path.join(annotation_dir, '%s.faa' % contig_base)
            score = os.path.join(annotation_dir, '%s.gene_score.txt' % contig_base)
            stdout_log = os.path.join(annotation_dir, '%s.stdout.log' % contig_base)
            faa_file_tmp.append(cds_aa)

            workflow.add_task_gridable(
                'prodigal -m -p meta -i [depends[0]] '
                '-f gff -o [targets[0]] -d [targets[1]] -s [targets[3]] '
                '-a [targets[2]] '
                '>[args[0]] 2>&1',
                depends=[contig, TrackedExecutable("prodigal")],
                targets=[gff_file, cds_file, cds_aa, score],
                args=[stdout_log],
                cores=threads,
                mem=mem_equation,
                time=time_equation,
                name=contig_base + "__prodigal")

        for myfile in faa_file_tmp:
            myname = os.path.basename(myfile)
            myfile_new = os.path.join(prodigal_dir, myname)
            faa_file.append(myfile_new)
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[myfile],
                              targets=[myfile_new],
                              cores=1,
                              name="ln__" + myname)

            myfna = re.sub(".faa", ".fna", myfile)
            myfna_new = re.sub(".faa", ".fna", myfile_new)
            if gene_call_type == "prodigal":
                fna_file.append(myfna_new)
                mygff_new = re.sub(".faa", ".gff", myfile_new)
                gff_files.append(mygff_new)
                prokka_dir = prodigal_dir
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[myfna],
                              targets=[myfna_new],
                              cores=1,
                              name="ln__" + os.path.basename(myfna))

            mygff = re.sub(".faa", ".gff", myfile)
            mygff_new = re.sub(".faa", ".gff", myfile_new)
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[mygff],
                              targets=[mygff_new],
                              cores=1,
                              name="ln__" + os.path.basename(mygff))

    ## Calling genes with Prokka
    if gene_call_type == "prokka" or gene_call_type == "both":
        os.system("mkdir -p " + prokka_dir)
        for contig in filtered_contigs:
            contig_base = os.path.basename(contig).split(os.extsep)[0]
            sample = os.path.basename(contig_base)
            annotation_dir = os.path.join(prokka_dir, sample)
            os.system("mkdir -p " + annotation_dir)
            stdout_log = os.path.join(annotation_dir, '%s.prokka.bacteria.stdout.log' % contig_base)
            score = os.path.join(annotation_dir, '%s.gene_score.txt' % contig_base)
            gene_nuc = os.path.join(annotation_dir, '%s.ffn' % contig_base)
            gene_aa = os.path.join(annotation_dir, '%s.faa' % contig_base)
            gff_file = os.path.join(annotation_dir, '%s.gff' % contig_base)
            fna_file_tmp.append(gene_nuc)
            gff_files_tmp.append(gff_file)

            workflow.add_task_gridable(
                'prokka --prefix [args[0]] --addgenes --addmrna --force --metagenome '
                '--cpus [args[2]] '
                '--outdir [args[1]] [depends[0]] '
                '>[args[3]] 2>&1 ',
                depends=[contig, TrackedExecutable("prokka")],
                targets=[gene_nuc, gene_aa, gff_file],
                args=[sample, annotation_dir, threads, stdout_log],
                cores=threads,
                mem=mem_equation,
                time=time_equation,
                name=contig_base + "__prokka")

        for myfile in gff_files_tmp:
            myname = os.path.basename(myfile)
            myfile_new = os.path.join(prokka_dir, myname)
            gff_files.append(myfile_new)
        for myfile in fna_file_tmp:
            myname = os.path.basename(myfile)
            myfile_new = os.path.join(prokka_dir, myname)
            fna_file.append(myfile_new)
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[myfile],
                              targets=[myfile_new],
                              cores=1,
                              name="ln__" + myname)

            myfaa = re.sub(".ffn", ".faa", myfile)
            myfaa_new = re.sub(".ffn", ".faa", myfile_new)
            if gene_call_type == "prokka":
                faa_file.append(myfaa_new)
                prodigal_dir = prokka_dir
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[myfaa],
                              targets=[myfaa_new],
                              cores=1,
                              name="ln__" + os.path.basename(myfaa))

            mygff = re.sub(".ffn", ".gff", myfile)
            mygff_new = re.sub(".ffn", ".gff", myfile_new)
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[mygff],
                              targets=[mygff_new],
                              cores=1,
                              name="ln__" + os.path.basename(mygff))

    # ================================================
    # Summarize sequences
    # ================================================
    ### combine gene sequences ###
    nuc_type = "ffn"
    if gene_call_type == "prodigal":
        nuc_type = "fna"
    mylog = re.sub(".fna", ".log", gene_file)
    workflow.add_task(
        'metawibele_combine_gene_sequences -p [args[0]] -e [args[1]] -o [targets[0]] > [args[2]] 2>&1 ',
        depends=utilities.add_to_list(fna_file, TrackedExecutable("metawibele_combine_gene_sequences")) + fna_file_tmp + gff_files + gff_files_tmp,
        targets=[gene_file],
        args=[prokka_dir, nuc_type, mylog],
        cores=1,
        name="combine_gene_sequences")

    ### combine protein sequences ###
    ## collect sequences
    mylog = re.sub(".faa", ".log", protein_file)
    workflow.add_task(
        'metawibele_format_protein_sequences -p [args[0]] -q [args[1]] -e faa -o [targets[0]] '
        '-m [targets[1]] >[args[2]] 2>&1 ',
        depends=utilities.add_to_list(faa_file, TrackedExecutable("metawibele_format_protein_sequences")) + faa_file_tmp + gff_files + gff_files_tmp,
        targets=[protein_file, gene_info],
        args=[prokka_dir, prodigal_dir, mylog],
        cores=1,
        name="format_protein_sequences")

    ## sort by length and filter out short sequences
    mylog = re.sub(".faa", ".log", protein_sort)
    workflow.add_task(
        'usearch -sortbylength [depends[0]] '
        '-fastaout [targets[0]] -minseqlength 0 >[args[0]] 2>&1 ',
        depends=[protein_file, TrackedExecutable("usearch")],
        targets=[protein_sort],
        args=[mylog],
        cores=1,
        name="usearch__sorting")

    ## extract nucleotide sequences for protein-coding genes
    mylog = re.sub(".fna", ".log", gene_PC_file)
    workflow.add_task(
        'metawibele_extract_protein_coding_genes -g [depends[0]] -p [depends[1]] -o [targets[0]] > [args[0]] 2>&1 ',
        depends=[gene_file, protein_sort, TrackedExecutable("metawibele_extract_protein_coding_genes")],
        targets=[gene_PC_file],
        args=[mylog],
        cores=1,
        name="extract_protein_coding_genes")

    ## extract sequences of complete ORFs
    mylog = re.sub(".fna", ".log", complete_gene)
    workflow.add_task(
        'metawibele_extract_complete_ORF_seq -t complete -m [depends[0]] -i [depends[1]] -o [targets[0]] >[args[0]] 2>&1',
        depends=[gene_info, gene_PC_file, TrackedExecutable("metawibele_extract_complete_ORF_seq")],
        targets=[complete_gene],
        args=[mylog],
        cores=1,
        name='extract_complete_ORF_seq')

    mylog = re.sub(".faa", ".log", complete_protein)
    workflow.add_task(
        'metawibele_extract_complete_ORF_seq -t complete -m [depends[0]] -i [depends[1]] -o [targets[0]] >[args[0]] 2>&1',
        depends=[gene_info, protein_sort, TrackedExecutable("metawibele_extract_complete_ORF_seq")],
        targets=[complete_protein],
        args=[mylog],
        cores=1,
        name='extract_complete_ORF_seq')

    return complete_gene, complete_protein
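# gene_calling expects one assembled contig file per sample, laid out as
# <assembly_dir>/<sample>/<sample><assembly_extentsion>, e.g. (hypothetical):
#
#   assembly/sample1/sample1.contigs.fa
#
# which matches the per-sample structure produced by the assembly() module in
# this file.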
def collect_sequence(ann_path, extension, partial_path, outfile):
    filelist = utilities.find_files(ann_path, extension, None)
    open_out = open(outfile, "w")
    # derive the abnormal-sequence file name from the main output name
    # (apply the substitutions in sequence so the second does not discard the first)
    outfile2 = re.sub(".faa", ".abnormal_seq.faa", outfile)
    outfile2 = re.sub(".fasta", ".abnormal_seq.fasta", outfile2)
    open_out2 = open(outfile2, "w")
    gff = {}
    types = {}
    partial = {}
    for myfile in filelist:
        sample = myfile
        mym = re.search("([^\/]+)$", sample)
        sample = mym.group(1)
        sample = re.sub("." + extension, "", sample)

        # collect info from the GFF file that corresponds to this sequence file
        contigs = {}
        mapping = {}
        mygff = re.sub("." + extension, ".gff", myfile)
        if not os.path.isfile(mygff):
            config.logger.info("ERROR! GFF file doesn't exist!\t" + mygff)
            continue
        config.logger.info("Read gff file: " + mygff)
        open_gff = open(mygff, "r")
        for line in open_gff:
            line = line.strip()
            if not len(line):
                continue
            if re.search("^#", line):
                if re.search("^##sequence-region", line):
                    mym = re.search("##sequence-region\s+([\S]+)\s+([\d]+)\s+([\d]+)", line)
                    tmp_contig = mym.group(1)
                    contigs[tmp_contig] = str(mym.group(2)) + "\t" + str(mym.group(3))
                if re.search("^# Sequence Data", line):
                    mytmp = line.split(";")
                    mym = re.sub("\"", "", mytmp[-1])
                    mym = re.search("^seqhdr\=([\S]+)[\s]+[\s\S]+len\=([\d]+)", mym)
                    tmp_contig = mym.group(1)
                    contigs[tmp_contig] = str(1) + "\t" + str(mym.group(2))
                continue
            if re.search("^>", line):
                break
            info = line.split("\t")
            feature = info[2]
            start = info[3]
            stop = info[4]
            strand = info[6]
            desc = info[8]
            myinfo = desc.split(";")
            if re.search("ID\=([^\;]+)", desc):
                myid = re.search("ID\=([^\;]+)", desc)
                myid = myid.group(1)
            else:
                continue
            gene_name = "NA"
            gene_id = "NA"
            gene_num = "NA"
            sample_id = "NA"
            contig_id = info[0]
            contig_len = "NA\tNA"
            if contig_id in contigs:
                contig_len = str(contigs[contig_id])
            for item in myinfo:
                if re.search("locus_tag=", item):
                    mym = re.search("locus_tag=([\S]+)", item)
                    gene_id = mym.group(1)
                if re.search("Name=", item):
                    mym = re.search("Name=([\S]+)", item)
                    gene_name = mym.group(1)
            if not re.search("locus_tag=", desc):
                gene_id = sample + "_" + re.sub("_", "-", myid)
            if not re.search("Name=", desc):
                gene_name = sample + "_" + re.sub("_", "-", myid)
            if re.search("\_", gene_id):
                mym = re.search("^([^\_]+)\_([\S]+)", gene_id)
                sample_id = mym.group(1)
                gene_num = mym.group(2)
            if not re.search("locus_tag=", desc):
                gene = gene_id
                sample_id = sample
            else:
                gene = sample + "_" + gene_num
            contig = sample + "_contig_" + contig_id
            if feature == "gene":
                if not sample in gff:
                    gff[sample] = {}
                if not gene_id in gff[sample]:
                    gff[sample][gene_id] = gene + "\t" + gene_id + "\t" + gene_name + "\t" + start + "\t" + stop + "\t" + strand + "\n" + contig + "\t" + contig_id + "\t" + contig_len + "\n" + sample + "\t" + sample_id
            if feature != "gene" and feature != "mRNA":
                if not sample in types:
                    types[sample] = {}
                if not gene_id in types[sample]:
                    types[sample][gene_id] = feature
            if feature == "CDS":
                new_id = contig_id + "\t" + start + "\t" + stop + "\t" + strand
                mapping[new_id] = gene + "\t" + gene_id
                if not re.search("locus_tag=", desc):
                    if not sample in gff:
                        gff[sample] = {}
                    if not gene_id in gff[sample]:
                        gff[sample][gene_id] = gene + "\t" + gene_id + "\t" + gene_name + "\t" + start + "\t" + stop + "\t" + strand + "\n" + contig + "\t" + contig_id + "\t" + contig_len + "\n" + sample + "\t" + sample_id
        open_gff.close()

        # collect sequences from the prodigal results, which include partial-gene info
        myfile1 = re.sub(ann_path, partial_path, myfile)
        open_file = open(myfile1, "r")
        AA_seq = {}
        myname = ""
        flag = 0
        hit_num = 0
        total_num = 0
        for line in open_file.readlines():
            line = line.strip()
            if not len(line):
                continue
            if re.search("^>", line):
                # sequence id
                total_num = total_num + 1
                line = re.sub("^>", "", line)
                info = line.split(" # ")
                if len(info) < 4:
                    config.logger.info("WARNING! No info!\t" + myfile1 + "\t" + line)
                    continue
                myref = re.sub("_[\d]+$", "", info[0])
                mystart = info[1]
                mystop = info[2]
                mystrand = "+"
                if info[3] == "-1":
                    mystrand = "-"
                myid = myref + "\t" + mystart + "\t" + mystop + "\t" + mystrand
                myname = myid
                flag = 0
                if myid in mapping:
                    hit_num = hit_num + 1
                    gene, gene_id = mapping[myid].split("\t")
                    myname = ">" + gene
                    if not myname in AA_seq:
                        AA_seq[myname] = ""
                    mym = re.search("partial=([\d]+)", info[-1])
                    mypartial = mym.group(1)
                    if not sample in partial:
                        partial[sample] = {}
                    partial[sample][gene_id] = mypartial
                    flag = 1
                else:
                    config.logger.info("No mapping info!\t" + line)
                    continue
            else:
                if flag == 1:
                    if myname in AA_seq:
                        AA_seq[myname] = AA_seq[myname] + line
        open_file.close()

        if hit_num != total_num:
            # fall back to the annotated sequence file for genes missing from the prodigal output
            open_file = open(myfile, "r")
            myname = ""
            for line in open_file.readlines():
                line = line.strip()
                if not len(line):
                    continue
                if re.search("^>", line):
                    # sequence id
                    mym = re.search("^>([^\_]+)\_([\S]+)", line)
                    sample_id = mym.group(1)
                    gene_num = mym.group(2)
                    gene = sample + "_" + gene_num
                    mym = re.search("^>([\S]+)", line)
                    gene_id = mym.group(1)
                    myname = ">" + gene
                    if not myname in AA_seq:
                        AA_seq[myname] = ""
                    else:
                        myname = ""
                        continue
                    if not sample in partial:
                        partial[sample] = {}
                    partial[sample][gene_id] = "00"
                else:
                    if myname in AA_seq:
                        AA_seq[myname] = AA_seq[myname] + line
            open_file.close()

        for myname in sorted(AA_seq.keys()):
            myseq = AA_seq[myname]
            # strip the terminal stop-codon marker
            myseq = re.sub("\*$", "", myseq)
            AA_seq[myname] = myseq
            if re.search("\*", myseq):
                # internal stop codon within the CDS: flag as abnormal
                open_out2.write(myname + "\n" + AA_seq[myname] + "\n")
                continue
            else:
                open_out.write(myname + "\n" + AA_seq[myname] + "\n")
    open_out.close()
    open_out2.close()
    return gff, types, partial
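# collect_sequence matches prodigal protein headers against the CDS
# coordinates recorded from the GFF. A typical prodigal header it parses
# looks like (illustrative):
#
#   >contig-1_2 # 337 # 2799 # 1 # ID=1_2;partial=00;start_type=ATG
#
# i.e. fields separated by " # ": id, start, stop, strand (1/-1), and a final
# attribute string from which "partial=" is extracted ("00" marks a complete
# ORF).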
def collect_DDI_info(cluster_mem, extension, ann_path, level, label, outfile):
    DDIs = {}
    anns = {}
    titles = {}
    filelist = utilities.find_files(ann_path, extension, None)
    for myfile in filelist:
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File does not exist: " + myfile)
            continue
        open_file = open(myfile, "r")
        for line in open_file.readlines():
            line = line.strip()
            if not len(line):
                continue
            info = line.split("\t")
            if re.search("^" + utilities.PROTEIN_ID, line):
                for item in info:
                    titles[item] = info.index(item)
                continue
            myid = info[titles[utilities.PROTEIN_ID]]
            if not myid in cluster_mem:
                continue
            mytype = info[titles["Type"]]
            mylevel = info[titles["Interaction"]]
            mypfam = info[titles["Pfam1_ID"]] + ":" + info[titles["Pfam2_ID"]]
            myann = info[titles["Pfam1_ann"]] + ":" + info[titles["Pfam2_ann"]]
            if not myid in DDIs:
                DDIs[myid] = {}
            DDIs[myid][mylevel + "\t" + mypfam] = ""
            anns[mypfam] = myann
        open_file.close()

    # output details
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.", outfile)
    outfile2 = re.sub(".tsv", ".detail.tsv", outfile1)
    open_out1 = open(outfile1, "w")
    open_out2 = open(outfile2, "w")
    open_out2.write(utilities.PROTEIN_ID + "\ttype\tdetail\tannotation\tinteraction\n")
    open_out1.write(utilities.PROTEIN_ID + "\tType\tInteraction\tPfam1_ID\tPfam2_ID\tPfam1_ann\tPfam2_ann\n")
    for myid in sorted(DDIs.keys()):
        mypfam = ""
        myann = ""
        mylevel = ""
        for item in sorted(DDIs[myid].keys()):
            tmp = item.split("\t")
            myt = label
            myl = tmp[0]
            pfam1, pfam2 = tmp[1].split(":")
            ann1 = "NA:NA"
            if tmp[1] in anns:
                ann1 = anns[tmp[1]]
            ann1 = re.sub(":", "\t", ann1)
            open_out1.write(myid + "\t" + myt + "\t" + myl + "\t" + pfam1 + "\t" + pfam2 + "\t" + ann1 + "\n")
            if level != "no":
                if tmp[0] != "NA":
                    if tmp[0] != level:
                        continue
            mypfam = mypfam + tmp[1] + ";"
            mylevel = mylevel + tmp[0] + ";"
            if tmp[1] in anns:
                myann = myann + anns[tmp[1]] + ";"
            else:
                myann = myann + "NA;"
        mypfam = re.sub(";$", "", mypfam)
        myann = re.sub(";$", "", myann)
        mylevel = re.sub(";$", "", mylevel)
        if mypfam == "":
            continue
        open_out2.write(myid + "\t" + label + "\t" + mypfam + "\t" + myann + "\t" + mylevel + "\n")
    open_out1.close()
    open_out2.close()
    return DDIs, anns
def collect_ann_info(cluster_mem, extension, ann_path, types, outfile):
    anns = {}
    anns_info = {}
    filelist = utilities.find_files(ann_path, extension, None)
    for myfile in filelist:
        for suffix in types:
            myfile1 = re.sub(extension, suffix, myfile)
            if not os.path.isfile(myfile1):
                continue
            open_file = open(myfile1, "r")
            mym = re.search("interpro.([\S]+).tsv", suffix)
            mytype = "InterProScan_" + mym.group(1)
            for line in open_file.readlines():
                line = line.strip()
                if not len(line):
                    continue
                if re.search("^" + utilities.PROTEIN_ID, line):
                    continue
                info = line.split("\t")
                myid = info[0]
                if not myid in cluster_mem:
                    continue
                acc = info[1]
                if not myid in anns:
                    anns[myid] = {}
                if not mytype in anns[myid]:
                    anns[myid][mytype] = {}
                if not mytype in anns_info:
                    anns_info[mytype] = {}
                if info[2] == "":
                    info[2] = "NA"
                if info[3] == "":
                    info[3] = "NA"
                anns[myid][mytype][acc] = info[2] + "\t" + info[3]
                anns_info[mytype][acc] = info[2] + "\t" + info[3]
            open_file.close()

    # output details
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.", outfile)
    open_out = open(outfile1, "w")
    open_out.write(utilities.PROTEIN_ID + "\ttype\tdetail\tdescription\tInterPro_accession\n")
    for myid in sorted(anns.keys()):
        for mytype in sorted(anns[myid].keys()):
            myinfo1 = ""
            myinfo2 = ""
            myinfo3 = ""
            for myacc in sorted(anns[myid][mytype].keys()):
                myinfo1 = myinfo1 + myacc + ";"
                tmp = anns[myid][mytype][myacc].split("\t")
                myinfo2 = myinfo2 + tmp[0] + ";"
                myinfo3 = myinfo3 + tmp[1] + ";"
            myinfo1 = re.sub(";$", "", myinfo1)
            myinfo2 = re.sub(";$", "", myinfo2)
            myinfo3 = re.sub(";$", "", myinfo3)
            if myinfo1 == "":
                continue
            open_out.write(myid + "\t" + mytype + "\t" + myinfo1 + "\t" + myinfo2 + "\t" + myinfo3 + "\n")
    open_out.close()
    return anns, anns_info
def collect_localizing_info(cluster_mem, extension, ann_path, outfile):
    gram_p = {}
    gram_n = {}
    archaea = {}
    location = {}
    score = {}
    location_n = {}
    location_p = {}
    location_a = {}

    def parse_psortb_location(myfile, per_sample, per_type):
        # Parse one PSORTb location table, keeping the best-scoring
        # localization per protein across the gram+/gram-/archaea runs.
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File does not exist: " + myfile)
            return
        open_file = open(myfile, "r")
        for line in open_file.readlines():
            line = line.strip()
            if not len(line):
                continue
            if re.search("^name", line):
                continue
            info = line.split("\t")
            myid = info[0]
            if not myid in cluster_mem:
                continue
            sample = re.sub("_[\d]+$", "", myid)
            if not sample in per_sample:
                per_sample[sample] = {}
            if not myid in location:
                location[myid] = info[1]
                score[myid] = info[2]
            else:
                if float(info[2]) > float(score[myid]):
                    location[myid] = info[1]
                    score[myid] = info[2]
            per_type[myid] = info[1]
            per_sample[sample][myid] = info[1]
        open_file.close()

    # each gram-positive table has matching gram-negative and archaea tables
    # alongside it, derived by suffix substitution
    filelist = utilities.find_files(ann_path, extension, None)
    for myfile in filelist:
        parse_psortb_location(myfile, gram_p, location_p)
        parse_psortb_location(re.sub("psortb.gram_positive.out.location.tsv", "psortb.gram_negative.out.location.tsv", myfile), gram_n, location_n)
        parse_psortb_location(re.sub("psortb.gram_positive.out.location.tsv", "psortb.archaea.out.location.tsv", myfile), archaea, location_a)

    # output details
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.", outfile)
    open_out = open(outfile1, "w")
    open_out.write(utilities.PROTEIN_ID + "\ttype\tdetail\tscore\n")
    renaming = {
        "Unknown": "PSORTb_unknown",
        "Cytoplasmic": "PSORTb_cytoplasmic",
        "CytoplasmicMembrane": "PSORTb_cytoplasmicMembrane",
        "Extracellular": "PSORTb_extracellular",
        "Cellwall": "PSORTb_cellWall",
        "OuterMembrane": "PSORTb_outerMembrane",
        "Periplasmic": "PSORTb_periplasmic",
    }
    for myid in sorted(location.keys()):
        mytype = renaming.get(location[myid], location[myid])
        myscore = score.get(myid, "NA")
        mydetail = re.sub("PSORTb_", "", mytype)
        open_out.write(myid + "\t" + mytype + "\t" + mydetail + "\t" + str(myscore) + "\n")
    open_out.close()
    return gram_p, gram_n, archaea, location, score, location_p, location_n, location_a
def extract_psortb_info(extension, psortb_path):
    def parse_psortb_output(myfile, myout):
        # Parse one raw PSORTb output file and write a simplified
        # name/type/score table.
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File does not exist: " + myfile)
            return
        config.logger.info("OK!\t" + myfile)
        open_file = open(myfile, "r")
        myid = ""
        out_items = []
        flag = 0
        for line in open_file:
            line = line.strip()
            if not len(line):
                continue
            if re.search("SeqID:", line):
                mym = re.search("SeqID:\s+([\S]+)", line)
                myid = mym.group(1)
                continue
            if re.search("Final", line):
                # the prediction follows the "Final Prediction:" header
                flag = 1
                continue
            if flag == 1:
                line = re.sub("\s+", "\t", line)
                info = line.split("\t")
                mypredict = "NA"
                myscore = 0
                if not re.search("[\S]+", info[0]):
                    mypredict = info[1]
                    myscore = info[-1]
                else:
                    mypredict = info[0]
                    myscore = info[-1]
                if mypredict == myscore:
                    myscore = 0
                flag = 0
                if re.search("Unknown", mypredict):
                    mypredict = "Unknown"
                myscore = re.sub("\s+", "", str(myscore))
                if re.search("[a-zA-Z]+", myscore):
                    myscore = 0
                out_items.append(myid + "\t" + mypredict + "\t" + str(myscore))
        open_file.close()
        open_out = open(myout, "w")
        open_out.write("name\ttype\tscore\n")
        for item in out_items:
            open_out.write(item + "\n")
        open_out.close()

    # each gram-positive output has matching gram-negative and archaea
    # outputs alongside it, derived by suffix substitution
    filelist = utilities.find_files(psortb_path, extension, None)
    for myfile in filelist:
        parse_psortb_output(myfile, re.sub(".txt", ".location.tsv", myfile))
        myfile1 = re.sub("psortb.gram_positive.out.txt", "psortb.gram_negative.out.txt", myfile)
        parse_psortb_output(myfile1, re.sub("gram_negative.out.txt", "gram_negative.out.location.tsv", myfile1))
        myfile1 = re.sub("psortb.gram_positive.out.txt", "psortb.archaea.out.txt", myfile)
        parse_psortb_output(myfile1, re.sub(".txt", ".location.tsv", myfile1))
def collect_sequence(gene_path, extension, outfile):
    sampleid = {}
    filelist = utilities.find_files(gene_path, extension, None)
    open_out = open(outfile, "w")
    outfile1 = re.sub(".fna", "_protein_coding.fna", outfile)
    #open_out1 = open(outfile1, "w")
    for myfile in filelist:
        sample = myfile
        mym = re.search("([^\/]+)$", sample)
        sample = mym.group(1)
        sample = re.sub("." + extension, "", sample)
        mygff = re.sub("." + extension, ".gff", myfile)

        # collect protein-coding IDs
        gffs = {}
        open_gff = open(mygff, "r")
        for line in open_gff:
            line = line.strip()
            if not len(line):
                continue
            if re.search("^\#", line):
                continue
            if re.search("^\>", line):
                break
            info = line.split("\t")
            if info[2] == "CDS":
                # protein-coding genes
                mym = re.search("^ID=([^\;]+)", info[-1])
                gffs[mym.group(1)] = ""
            else:
                config.logger.info("Skip non-CDS\t" + info[2])
        open_gff.close()

        # output sequences
        open_file = open(myfile, "r")
        flag = 0
        for line in open_file.readlines():
            line = line.strip()
            if not len(line):
                continue
            if re.search("^\>", line):
                # sequence id: rename with the sample prefix
                if re.search("ID\=", line):
                    mym = re.search("ID\=([^\;]+)", line)
                    mygene = mym.group(1)
                    mym = re.search("\>([\S]+)", line)
                    myid = mym.group(1)
                    myid_new = sample + "_" + re.sub("_", "-", mygene)
                    sampleid[sample] = sample
                    line = re.sub(myid, myid_new, line)
                else:
                    mym = re.search("\>([\S]+)", line)
                    mygene = mym.group(1)
                    mym = re.search("\>([^\_]+)", line)
                    myid = mym.group(1)
                    sampleid[sample] = myid
                    line = re.sub(myid, sample, line)
                open_out.write(line + "\n")
                if mygene in gffs:
                    flag = 1
                    #open_out1.write(line + "\n")
                else:
                    config.logger.info("Skip non-protein-coding sequence: " + mygene + "\t" + line)
                    flag = 0
                continue
            else:
                open_out.write(line + "\n")
                #if flag == 1:
                #    open_out1.write(line + "\n")
        open_file.close()
    open_out.close()
    #open_out1.close()
    return sampleid
def extract_interproscan_info(extension, interproscan_path):
    # InterProScan member databases that are written to per-database tables:
    # analysis name -> output file suffix
    interpro_dbs = {
        "Pfam": "PfamDomain",
        "SUPERFAMILY": "SUPERFAMILY",
        "ProSiteProfiles": "PROSITEPROFILES",
        "Gene3D": "Gene3D",
        "PANTHER": "PANTHER",
        "TIGRFAM": "TIGRFAM",
        "SFLD": "SFLD",
        "ProDom": "ProDom",
        "Hamap": "Hamap",
        "SMART": "SMART",
        "CDD": "CDD",
        "ProSitePatterns": "PROSITEPATTERNS",
        "PRINTS": "PRINTS",
        "PIRSF": "PIRSF",
        "MobiDBLite": "MobiDBLite",
        "Coils": "Coils",
    }
    filelist = utilities.find_files(interproscan_path, extension, None)
    for myfile in filelist:
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File does not exist: " + myfile)
            continue
        config.logger.info("OK!\t" + myfile)

        # signal peptide / transmembrane outputs
        open_signalp = open(re.sub(".interproscan.txt", ".signalp.signaling.tsv", myfile), "w")
        open_tmhmm = open(re.sub(".interproscan.txt", ".tmhmm.transmembrane.tsv", myfile), "w")
        open_phobius_sp = open(re.sub(".interproscan.txt", ".phobius.signaling.tsv", myfile), "w")
        open_phobius_tm = open(re.sub(".interproscan.txt", ".phobius.transmembrane.tsv", myfile), "w")
        open_signalp.write(utilities.PROTEIN_ID + "\tSP\tPrediction\tStart\tEnd\n")
        open_tmhmm.write(utilities.PROTEIN_ID + "\tTM\tPrediction\tStart\tEnd\n")
        open_phobius_sp.write(utilities.PROTEIN_ID + "\tSP\tPrediction\tStart\tEnd\n")
        open_phobius_tm.write(utilities.PROTEIN_ID + "\tTM\tPrediction\tStart\tEnd\n")

        # one table per InterProScan member database
        open_dbs = {}
        for db_type, suffix in interpro_dbs.items():
            myout = re.sub(".interproscan.txt", ".interpro." + suffix + ".tsv", myfile)
            open_dbs[db_type] = open(myout, "w")
            open_dbs[db_type].write(utilities.PROTEIN_ID + "\t" + db_type + "\tDescription\tInterPro\tEvalue\n")

        open_file = open(myfile, "r")
        for line in open_file.readlines():
            line = line.strip()
            if not len(line):
                continue
            if re.search("^#", line):
                continue
            info = line.split("\t")
            myid = info[0]
            mytype = info[3]
            myacc = info[4]
            mydec = info[5]
            start = info[6]
            end = info[7]
            myscore = info[8]
            mystatus = info[9]
            # InterPro accession/description columns are absent for some
            # analyses, so check the row length before indexing them
            if len(info) < 13:
                interproacc = "NA"
                interprodec = "NA"
            else:
                interproacc = info[11]
                interprodec = info[12]
            if start == "":
                start = "NA"
            if end == "":
                end = "NA"
            if interproacc == "":
                interproacc = "NA"
            if interprodec == "":
                interprodec = "NA"
            if mydec == "":
                mydec = "NA"
            if interprodec != "NA":
                mydec = interprodec
            if mystatus != "T":
                # skip predictions not flagged as reliable
                continue
            # SignalP
            if mytype == "SignalP_GRAM_NEGATIVE" or mytype == "SignalP_GRAM_POSITIVE":
                open_signalp.write(myid + "\t" + mytype + "\t" + myacc + "\t" + start + "\t" + end + "\n")
            # TMHMM
            if mytype == "TMHMM":
                open_tmhmm.write(myid + "\t" + mytype + "\t" + myacc + "\t" + start + "\t" + end + "\n")
            # Phobius
            if mytype == "Phobius":
                if re.search("SIGNAL_PEPTIDE", myacc):
                    # signal peptide
                    open_phobius_sp.write(myid + "\t" + myacc + "\t" + mydec + "\t" + start + "\t" + end + "\n")
                if re.search("TRANSMEMBRANE", myacc):
                    # transmembrane
                    open_phobius_tm.write(myid + "\t" + myacc + "\t" + mydec + "\t" + start + "\t" + end + "\n")
            # member databases (Pfam, SUPERFAMILY, Gene3D, ...)
            if mytype in open_dbs:
                open_dbs[mytype].write(myid + "\t" + myacc + "\t" + mydec + "\t" + interproacc + "\t" + myscore + "\n")
        open_file.close()
        open_signalp.close()
        open_tmhmm.close()
        open_phobius_sp.close()
        open_phobius_tm.close()
        for handle in open_dbs.values():
            handle.close()
def assembly(workflow, input_dir, extension, extension_paired, threads, output_folder, contigs):
    """
    This set of tasks will run assembly on the input files provided.

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        input_dir: The directory path of fastq files.
        extension: The extension for all reads files, e.g. .fastq.gz
        extension_paired: The extension for paired reads, e.g. _R1.fastq.gz,_R2.fastq.gz
        threads (int): The number of threads/cores for clustering to use.
        output_folder (string): The path of the output folder.
        contigs: The summarized contig file.

    Requires:
        MEGAHIT v1.1.3: A program for assembling metagenomic sequencing reads
        fastq files

    Returns:
        list: the paths of the per-sample contig files.

    Example:
        from anadama2 import Workflow
        from MetaWIBELE.characterize import characterization

        # create an anadama2 workflow instance
        workflow=Workflow()

        # add assembly tasks
        mycontigs = preprocessing_tasks.assembly (workflow, input_dir, extension, extension_paired, threads, assembly_dir, contigs)

        # run the workflow
        workflow.go()
    """

    config.logger.info("###### Start assembly module ######")
    time_equation = config.time  # xxx hours defined in global config
    mem_equation = config.memory  # xxx GB defined in global config

    # ================================================
    # collect sequences
    # ================================================
    pair_identifier = None
    pair_identifier2 = None
    if extension_paired:
        extension_paireds = extension_paired.split(",")
        pair_identifier = re.sub(extension, "", extension_paireds[0])
        pair_identifier2 = re.sub("1", "2", pair_identifier)
        sample_files = utilities.find_files(input_dir, extension_paireds[0], None)
        samples = utilities.sample_names(sample_files, extension_paireds[0], None)
    else:
        extension_paireds = [extension]
        sample_files = utilities.find_files(input_dir, extension, None)
        samples = utilities.sample_names(sample_files, extension, None)
    split_dir = input_dir
    assembly_dir = output_folder

    split_files = []
    contigs_list = []
    for sample in samples:
        mypair = "none"
        myorphan = "none"
        mypair_tmp = []
        for item in extension_paireds:
            if item == "none":
                continue
            myfile = os.path.join(split_dir, sample + item)
            if os.path.isfile(myfile):
                mypair_tmp.append(myfile)
            else:
                sys.exit("File does not exist! " + myfile)
        if len(mypair_tmp) == 1:
            # split into paired reads files
            mypair_tmp = utilities.split_paired_reads(mypair_tmp[0], extension, pair_identifier)
            if len(mypair_tmp) == 1:
                myorphan = mypair_tmp[0]
            if len(mypair_tmp) == 2:
                mypair = ",".join(mypair_tmp)
            if len(mypair_tmp) == 3:
                mypair = ",".join(mypair_tmp[0:2])
                myorphan = mypair_tmp[2]
        else:
            if len(mypair_tmp) == 2:
                mypair = ",".join(mypair_tmp)
            if len(mypair_tmp) == 3:
                tmp1 = []
                tmp2 = []
                for i in mypair_tmp:
                    if re.search(pair_identifier, i):
                        tmp1.append(i)
                    elif re.search(pair_identifier2, i):
                        tmp1.append(i)
                    else:
                        tmp2.append(i)
                if len(tmp1) > 0:
                    mypair = ",".join(tmp1)
                if len(tmp2) > 0:
                    myorphan = ",".join(tmp2)
        split_files.append((sample, mypair, myorphan))

        seq_base = sample
        megahit_contig_dir = os.path.join(assembly_dir, seq_base)
        megahit_contig = os.path.join(megahit_contig_dir, '%s.contigs.fa' % seq_base)
        contigs_list.append(megahit_contig)

    ## run MEGAHIT
    os.system("mkdir -p " + assembly_dir)
    for (sample, mypair, myorphan) in split_files:
        seq_base = sample
        megahit_contig_dir = os.path.join(assembly_dir, seq_base)
        megahit_contig = os.path.join(megahit_contig_dir, '%s.contigs.fa' % seq_base)
        ## MEGAHIT needs memory in a byte format so let's take care of data
        #time_equation = "24*60 if file_size('[depends[0]]') < 25 else 6*24*60"  # 24 hours or more depending on file size
        #mem_equation = "32*1024 if file_size('[depends[0]]') < 25 else 3*32*1024"  # 32 GB or more depending on file size
        mylog = os.path.join(assembly_dir, '%s.log' % seq_base)
        if mypair != "none":
            tmp = mypair.split(",")
            if len(tmp) == 2:
                # paired reads
                f_seq = tmp[0]
                r_seq = tmp[1]
                if myorphan != "none":
                    workflow.add_task_gridable(
                        "rm -rf " + megahit_contig_dir + " && "
                        "megahit -1 [depends[0]] -2 [depends[1]] -r [args[2]] -t [args[0]] -o [args[3]] --out-prefix [args[1]] >[args[4]] 2>&1",
                        depends=[f_seq, r_seq, TrackedExecutable("megahit")],
                        targets=[megahit_contig],
                        args=[threads, seq_base, myorphan, megahit_contig_dir, mylog],
                        cores=threads,
                        mem=mem_equation,
                        time=time_equation,
                        name=sample + "__megahit")
                else:
                    workflow.add_task_gridable(
                        "rm -rf " + megahit_contig_dir + " && "
                        "megahit -1 [depends[0]] -2 [depends[1]] -t [args[0]] -o [args[2]] --out-prefix [args[1]] >[args[3]] 2>&1",
                        depends=[f_seq, r_seq, TrackedExecutable("megahit")],
                        targets=[megahit_contig],
                        args=[threads, seq_base, megahit_contig_dir, mylog],
                        cores=threads,
                        mem=mem_equation,
                        time=time_equation,
                        name=sample + "__megahit")
            else:
                workflow.add_task_gridable(
                    "rm -rf " + megahit_contig_dir + " && "
                    "megahit -r [depends[0]] -t [args[0]] -o [args[2]] --out-prefix [args[1]] >[args[3]] 2>&1",
                    depends=[mypair, TrackedExecutable("megahit")],
                    targets=[megahit_contig],
                    args=[threads, seq_base, megahit_contig_dir, mylog],
                    cores=threads,
                    mem=mem_equation,
                    time=time_equation,
                    name=sample + "__megahit")
        else:
            if myorphan != "none":
                workflow.add_task_gridable(
                    "rm -rf " + megahit_contig_dir + " && "
                    "megahit -r [depends[0]] -t [args[0]] -o [args[2]] --out-prefix [args[1]] >[args[3]] 2>&1",
                    depends=[myorphan, TrackedExecutable("megahit")],
                    targets=[megahit_contig],
                    args=[threads, seq_base, megahit_contig_dir, mylog],
                    cores=threads,
                    mem=mem_equation,
                    time=time_equation,
                    name=sample + "__megahit")

    for myfile in contigs_list:
        myname = os.path.basename(myfile)
        myfile_new = os.path.join(assembly_dir, myname)
        workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                          depends=[myfile],
                          targets=[myfile_new],
                          cores=1,
                          name="ln__" + myname)

    ## combine contigs sequences
    mylog = contigs + ".log"
    workflow.add_task(
        "metawibele_format_contig_sequences -p [args[0]] -e contigs.fa -o [targets[0]] > [args[1]] 2>&1",
        depends=utilities.add_to_list(contigs_list, TrackedExecutable("metawibele_format_contig_sequences")),
        targets=[contigs],
        args=[assembly_dir, mylog],
        cores=1,
        name="format_contig_table")

    return contigs_list
def collect_signaling_info(cluster_mem, extension, ann_path, outfile):
    gram_p = {}
    gram_n = {}
    signals = {}
    signals_n = {}
    signals_p = {}
    filelist = utilities.find_files(ann_path, extension, None)
    for myfile in filelist:
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File does not exist: " + myfile)
            continue
        open_file = open(myfile, "r")
        titles = {}
        for line in open_file.readlines():
            line = line.strip()
            if not len(line):
                continue
            info = line.split("\t")
            if re.search("^" + utilities.PROTEIN_ID, line):
                # index columns by header name
                for item in info:
                    titles[item] = info.index(item)
                continue
            myid = info[titles[utilities.PROTEIN_ID]]
            if not myid in cluster_mem:
                continue
            if not myid in signals:
                signals[myid] = {}
            signals[myid][info[titles["Prediction"]]] = ""
            mytype = info[titles["SP"]]
            sample = re.sub("_[\d]+$", "", myid)
            if mytype == "SignalP_GRAM_POSITIVE":
                signals_p[myid] = info[titles["Prediction"]]
                if not sample in gram_p:
                    gram_p[sample] = 0
                gram_p[sample] = gram_p[sample] + 1
            if mytype == "SignalP_GRAM_NEGATIVE":
                signals_n[myid] = info[titles["Prediction"]]
                if not sample in gram_n:
                    gram_n[sample] = 0
                gram_n[sample] = gram_n[sample] + 1
        open_file.close()

    # output details for each ORF
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.", outfile)
    open_out = open(outfile1, "w")
    open_out.write(utilities.PROTEIN_ID + "\ttype\tdetail\tdescription\n")
    for myid in sorted(signals.keys()):
        myinfo = ";".join(signals[myid].keys())
        open_out.write(myid + "\tSignalP_signaling\tSignalP_signaling\t" + myinfo + "\n")
    open_out.close()
    return gram_p, gram_n, signals, signals_p, signals_n
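# collect_signaling_info (like the other collectors here) locates columns by
# header name rather than position; the SignalP tables written by
# extract_interproscan_info provide the expected "SP" and "Prediction"
# columns, e.g. (illustrative row):
#
#   <PROTEIN_ID>  SP                     Prediction   Start  End
#   sample1_10    SignalP_GRAM_POSITIVE  SignalP-TM   1      22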
def gene_catalog(workflow, complete_gene, complete_protein, input_dir, extension, extension_paired,
                 threads, prefix_gene_catalog, gene_catalog, gene_catalog_nuc, gene_catalog_prot,
                 mapping_dir, gene_catalog_saf, gene_catalog_count):
    """
    This set of tasks will build gene catalogs.

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        complete_gene: The fasta file of gene nucleotide sequences for complete ORFs.
        complete_protein: The fasta file of protein sequences for complete ORFs.
        input_dir: The directory path of fastq files.
        extension: The extension for all reads files, e.g. .fastq.gz
        extension_paired: The extension for paired reads, e.g. _R1.fastq.gz,_R2.fastq.gz
        threads (int): The number of threads/cores to use.
        prefix_gene_catalog: The prefix of the gene catalog file.
        gene_catalog: The gene catalog file.
        gene_catalog_nuc: The fasta file of nucleotide sequences for the gene catalog.
        gene_catalog_prot: The fasta file of protein sequences for the gene catalog.
        mapping_dir: The directory path of mapping results.
        gene_catalog_saf: The SAF gtf file for the gene catalog.
        gene_catalog_count: The count file for the gene catalog.

    Requires:
        bowtie2 (tested with 2.3.2)
        samtools (tested with 1.5)
        featureCounts (tested with Version 1.6.2)
        the nucleotide and amino acid sequences for gene catalogs
        fastq files for each sample

    Returns:
        string: file names of gene catalogs

    Example:
        from anadama2 import Workflow
        from MetaWIBELE.characterize import characterization

        # create an anadama2 workflow instance
        workflow=Workflow()

        # add gene catalog tasks
        mygene_catalog, mycounts = preprocessing_tasks.gene_catalog (workflow, complete_gene, complete_protein, input_dir, extension, extension_paired, threads, prefix_gene_catalog, gene_catalog, gene_catalog_nuc, gene_catalog_prot, mapping_dir, gene_catalog_saf, gene_catalog_count)

        # run the workflow
        workflow.go()
    """

    config.logger.info("###### Start gene_catalog module ######")
    time_equation = config.time  # xxx hours defined in global config
    mem_equation = config.memory  # xxx GB defined in global config

    ### run gene-catalog workflow
    mylog = gene_catalog_nuc + ".log"
    myclust = gene_catalog_nuc + ".clstr"
    workflow.add_task(
        'cd-hit-est -i [depends[0]] [args[0]] -o [targets[0]] >[args[1]] 2>&1 ',
        depends=[complete_gene, TrackedExecutable("cd-hit-est")],
        targets=[gene_catalog_nuc, myclust],
        args=[config.cd_hit_gene_opts, mylog],
        cores=threads,
        name="cd-hit-est")

    mylog = gene_catalog + ".log"
    workflow.add_task(
        'metawibele_extract_cluster -c [depends[0]] -o [targets[0]] >[args[0]] 2>&1 ',
        depends=[myclust, TrackedExecutable("metawibele_extract_cluster")],
        targets=[gene_catalog],
        args=[mylog],
        cores=1,
        name="extract_cluster_CD-hit")

    mylog = gene_catalog_prot + ".log"
    workflow.add_task(
        'metawibele_extract_non_redundance_seq -r [depends[0]] -i [depends[1]] -o [targets[0]] >[args[0]] 2>&1 ',
        depends=[gene_catalog_nuc, complete_protein, TrackedExecutable("metawibele_extract_non_redundance_seq")],
        targets=[gene_catalog_prot],
        args=[mylog],
        cores=1,
        name="extract_non_redundance_seq")

    ### get the abundance of the gene catalog
    # run gene-abundance workflow
    mylog = gene_catalog_saf + ".log"
    workflow.add_task(
        'metawibele_gene_abundance_indexRef -r [depends[0]] -t gene -b [args[0]] -o [targets[0]] >[args[1]] 2>&1 ',
        depends=[gene_catalog_nuc, TrackedExecutable("metawibele_gene_abundance_indexRef")],
        targets=[gene_catalog_saf],
        args=[prefix_gene_catalog, mylog],
        cores=1,
        name="gene_abundance_indexRef")

    ## collect sequences
    if extension_paired:
        extension_paireds = extension_paired.split(",")
        sample_files = utilities.find_files(input_dir, extension_paireds[0], None)
        samples = utilities.sample_names(sample_files, extension_paireds[0], None)
    else:
        sample_files = utilities.find_files(input_dir, extension, None)
        samples = utilities.sample_names(sample_files, extension, None)

    ## collect the per-sample read files that bowtie2 will map to the gene catalog
    flt_seqs = []
    for sample in samples:
        seq_file = "NA"
        tmp = []  # guard against undefined read extensions
        if extension_paired:
            tmp = extension_paired.split(",")
        else:
            if extension != "none":
                tmp = extension.split(",")
        for item in tmp:
            if seq_file == "NA":
                seq_file = os.path.join(input_dir, sample + '%s' % item)
            else:
                seq_file = seq_file + "," + os.path.join(input_dir, sample + '%s' % item)
        flt_seqs.append((sample, seq_file))

    ## now run bowtie2 to map reads to the gene catalog
    mappings = []
    mappings_tmp = []
    for (sample, seq_file) in flt_seqs:
        seq_base = sample
        mydir = os.path.join(mapping_dir, sample)
        os.system("mkdir -p " + mydir)
        sample_counts = os.path.join(mydir, seq_base + ".sort.bed")
        stdout_log = os.path.join(mydir, '%s.mapping.stdout.log' % seq_base)
        mappings_tmp.append(sample_counts)

        workflow.add_task(
            'metawibele_gene_abundance -r [depends[0]] -u [args[0]] -t [args[1]] -s [args[2]] -w [args[3]] '
            '> [args[4]] 2>&1 ',
            depends=[gene_catalog_nuc, gene_catalog_saf, TrackedExecutable("metawibele_gene_abundance")],
            targets=[sample_counts],
            args=[seq_file, threads, seq_base, mydir, stdout_log],
            cores=1,
            name=sample + "__gene_abundance")

    for myfile in mappings_tmp:
        myname = os.path.basename(myfile)
        myfile_new = os.path.join(mapping_dir, myname)
        mappings.append(myfile_new)
        workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                          depends=[myfile],
                          targets=[myfile_new],
                          cores=1,
                          name="ln__" + myname)

    # collect abundance
    mylog = gene_catalog_count + ".log"
    workflow.add_task(
        'metawibele_gene_catalog_abundance -p [args[0]] -s sort.bed -c [args[1]] -o [targets[0]] >[args[2]] 2>&1 ',
        depends=utilities.add_to_list(mappings, TrackedExecutable("metawibele_gene_catalog_abundance")),
        targets=[gene_catalog_count],
        args=[mapping_dir, gene_catalog, mylog],
        cores=1,
        name="gene_catalog_abundance")

    return gene_catalog, gene_catalog_count
def collect_phobius_info(cluster_mem, extension, ann_path, outfile):
    transmem = {}
    signal = {}
    detail_signal = {}
    detail_trans = {}
    filelist = utilities.find_files(ann_path, extension, None)
    for myfile in filelist:
        # signaling table
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File does not exist: " + myfile)
        else:
            open_file = open(myfile, "r")
            for line in open_file.readlines():
                line = line.strip()
                if not len(line):
                    continue
                if re.search("^" + utilities.PROTEIN_ID, line):
                    continue
                info = line.split("\t")
                myid = info[0]
                if not myid in cluster_mem:
                    continue
                sample = re.sub("_[\d]+$", "", myid)
                if not sample in signal:
                    signal[sample] = {}
                signal[sample][myid] = info[2]
                detail_signal[myid] = info[2]
            open_file.close()

        # matching transmembrane table, derived by suffix substitution
        myfile = re.sub(extension, "phobius.transmembrane.tsv", myfile)
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File does not exist: " + myfile)
        else:
            open_file = open(myfile, "r")
            for line in open_file.readlines():
                line = line.strip()
                if not len(line):
                    continue
                if re.search("^" + utilities.PROTEIN_ID, line):
                    continue
                info = line.split("\t")
                myid = info[0]
                if not myid in cluster_mem:
                    continue
                sample = re.sub("_[\d]+$", "", myid)
                if not sample in transmem:
                    transmem[sample] = {}
                transmem[sample][myid] = info[2]
                detail_trans[myid] = info[2]
            open_file.close()

    # output details
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.signaling.", outfile)
    open_out = open(outfile1, "w")
    open_out.write(utilities.PROTEIN_ID + "\ttype\tdetail\tdescription\n")
    for myid in sorted(detail_signal.keys()):
        open_out.write(myid + "\tPhobius_signaling\tPhobius_signaling\t" + detail_signal[myid] + "\n")
    open_out.close()

    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.transmembrane.", outfile)
    open_out = open(outfile1, "w")
    open_out.write(utilities.PROTEIN_ID + "\ttype\tdetail\tdescription\n")
    for myid in sorted(detail_trans.keys()):
        open_out.write(myid + "\tPhobius_transmembrane\tPhobius_transmembrane\t" + detail_trans[myid] + "\n")
    open_out.close()
    return signal, transmem
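# A minimal usage sketch (hypothetical arguments): the extension names the
# per-sample Phobius signaling tables, and the matching transmembrane tables
# are derived from them by suffix substitution:
#
#   signal, transmem = collect_phobius_info(cluster_mem,
#                                           "phobius.signaling.tsv",
#                                           "/path/to/annotations",
#                                           "demo_proteinfamilies.tsv")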