in_folder, o_file, gff_file = command_line() # Open GFF file and make dictionary containing gene names & their positions # Then add the per-library gene read counts to a dictionary gff_genes_dict = GFF.Parse(gff_file, create_nuc_dict=True) library_gene_counts = {} for (gene, [chromosome, spos, epos]) in gff_genes_dict: if not gff_genes_dict.is_transposon( gene) and not gff_genes_dict.is_common_rna(gene): library_gene_counts[gene] = [] for root, subfolders, files in os.walk(in_folder): for f_name in files: print("Working on", str(f_name)) f_in = os.path.join(root, f_name) parse_sam = SAM.Parse(f_in) parse_sam.reads_per_gene(gff_genes_dict) parse_sam.start() temp_gene_counts = parse_sam.get_reads_per_gene() # Count up the total number of reads in the library gzipped = False total_reads = 0 if f_in.endswith(".gz"): infile = gzip.open(f_in, 'rb') gzipped = True else: infile = open(f_in) for line in infile: if gzipped == True: line = line.decode('utf-8') line = line.split()
return(in_folder, o_file, gff_file) in_folder, o_file, gff_file = command_line() # Open GFF file and make dictionary containing gene names & their positions # Then add the per-library gene read counts to a dictionary gff_genes_dict = GFF.Parse_genes(gff_file, create_nuc_dict=True) library_gene_counts = {} for (gene, [chromosome, spos, epos]) in gff_genes_dict: library_gene_counts[gene] = [] for root, subfolders, files in os.walk(in_folder): for f_name in files: print("Working on", str(f_name)) f_in = os.path.join(root,f_name) temp_gene_counts = SAM.reads_per_gene(f_in, gff_genes_dict) # Count up the total number of reads in the library gzipped = False total_reads = 0 if f_in.endswith(".gz"): infile = gzip.open(f_in, 'rb') gzipped = True else: infile = open(f_in) for line in infile: if gzipped == True: line = str(line, encoding='utf8') line = line.split() if line[0][:1] != "@" and int(line[4]) >= 20: total_reads += 1 # Change read counts to RPKM value
def _run_art_illumina(fasta_fn, readlen, readcov, out_prefix):
    """Run art_illumina.exe on ``fasta_fn`` and wait for it to exit."""
    # Example usage: art_illumina.exe -i ../at9_chr3.fasta -l 34 -f 2 -o Output/RandomAtReads -sam
    # An argument list (not a joined string) works on POSIX as well as
    # Windows and survives spaces in file names.
    args = ["art_illumina.exe", "-i", str(fasta_fn), "-l", str(readlen),
            "-f", str(readcov), "-o", str(out_prefix), "-sam", "-q"]
    subprocess.Popen(args).wait()


def _shift_sam_positions(sam_obj, offset):
    """Add ``offset`` to field 3 (the position) of every read in ``sam_obj``."""
    for idx, read in enumerate(sam_obj.sam_list):
        sam_obj.sam_list[idx][3] = str(int(read[3]) + offset)


def simulate_reads(ref, chr, readlen, readcov, dupnum, duplen, sam, chr_len):
    """Simulate reads with ART and add extra reads for duplicated regions.

    Each selected chromosome is simulated in 10,000-character windows, then
    ``dupnum`` non-overlapping regions of length ``duplen`` are picked at
    random and simulated a second time. All temporary SAM files are merged,
    read names are renumbered, header lengths are corrected, the merged file
    is written to ``sam[:-3]`` and finally passed through FixARTSAMFile.py to
    produce ``sam``. The chosen duplicated regions are printed.

    Args:
        ref: Unused by this function.
        chr: "All" to use every key of ``chr_len`` matching ``Chr<digits>``,
            or a comma-separated list of chromosome names.
        readlen: Read length passed to art_illumina (-l).
        readcov: Fold coverage passed to art_illumina (-f).
        dupnum: Number of duplicated regions to simulate.
        duplen: Length of each duplicated region.
        sam: Final output file name; ``sam[:-3]`` is used as the prefix of
            intermediate files.
        chr_len: Mapping of chromosome name to its sequence string.

    Raises:
        ValueError: If duplications are requested but no selected chromosome
            is at least ``duplen`` long (the original looped forever).

    Relies on module-level ``SAM``, ``pypath``, ``sample`` and ``randrange``.
    """
    # List containing all SAM files to be combined together. First contains
    # regular read simulation SAM files, with regional duplication SAM files
    # added later.
    final_SAM_fns = []
    if chr == "All":
        chr_name_re = re.compile(r"Chr\d{1,}")
        chr_list = [name for name in chr_len if chr_name_re.match(name)]
    else:
        chr_list = chr.split(',')

    # Regular read profile: one ART run per 10 kb window.
    for chrom in chr_list:
        for offset in range(0, len(chr_len[chrom]), 10000):
            out_fn = str(sam[:-3]) + "_temp_" + str(chrom) + "_" + str(offset)
            temp_fasta = "regtemp_" + str(offset) + ".fasta"
            with open(temp_fasta, 'w') as outfile:
                outfile.write(">" + str(chrom) + "\n"
                              + chr_len[chrom][offset:offset + 10000])
            _run_art_illumina(temp_fasta, readlen, readcov, out_fn)
            new_out_fn = out_fn + ".sam"
            if offset > 0:
                # Positions are relative to the window; shift them back to
                # chromosome coordinates.
                reg_SAM = SAM.parse(new_out_fn, True)
                _shift_sam_positions(reg_SAM, offset)
                reg_SAM.output(new_out_fn)
            os.remove(temp_fasta)
            os.remove(out_fn + ".fq")
            os.remove(out_fn + ".aln")
            final_SAM_fns.append(new_out_fn)

    if dupnum > 0 and all(len(chr_len[name]) < duplen for name in chr_list):
        raise ValueError(
            "duplen exceeds the length of every selected chromosome")

    # Duplicated regions: (chromosome, start, end) of every region chosen.
    prev_sel = []
    for dup_idx in range(1, dupnum + 1):
        while True:
            dup_chr = ''.join(sample(chr_list, 1))
            spos = randrange(0, len(chr_len[dup_chr]))
            epos = spos + duplen
            if epos > len(chr_len[dup_chr]):
                continue
            # Accept only a candidate that overlaps no earlier region.
            # Every candidate is tested from scratch; the original kept a
            # running counter across rejected candidates and its test
            # (`not A or B`) let overlapping regions through.
            overlaps = any(
                dup_chr == c and spos <= end and start <= epos
                for (c, start, end) in prev_sel)
            if not overlaps:
                break
        prev_sel.append((dup_chr, spos, epos))
        temp_fasta = "duptemp_" + str(dup_idx) + ".fasta"
        dup_prefix = temp_fasta[:-6]
        with open(temp_fasta, 'w') as outfile:
            outfile.write(">" + str(dup_chr) + "\n"
                          + chr_len[dup_chr][spos:epos])
        _run_art_illumina(temp_fasta, readlen, readcov, dup_prefix)
        dup_fn = dup_prefix + ".sam"
        dup_SAM = SAM.parse(dup_fn, True)
        _shift_sam_positions(dup_SAM, spos)
        shifted_fn = dup_fn[:-4] + "_temp.sam"
        dup_SAM.output(shifted_fn)
        os.remove(temp_fasta)
        os.remove(dup_prefix + ".fq")
        os.remove(dup_prefix + ".aln")
        os.remove(dup_fn)
        final_SAM_fns.append(shifted_fn)

    final_SAM = SAM.parse(final_SAM_fns, True)
    read_name_re = re.compile(r"Chr\d{1}-(\d+)")
    max_num = int(read_name_re.match(final_SAM.sam_list[0][0]).group(1))
    min_met = False
    # Ensure that read names from one SAM file do not coincide with read
    # names from another SAM file: once the first read numbered 1 has been
    # seen, every later matching read gets the next unused number.
    for idx in range(len(final_SAM.sam_list)):
        r_name = final_SAM.sam_list[idx][0]
        match = read_name_re.match(r_name)
        if not match:
            continue
        if int(match.group(1)) == 1 and not min_met:
            min_met = True
            continue
        if min_met:
            max_num += 1
            line = final_SAM.sam_list[idx]
            line[0] = r_name[:5] + str(max_num)
            final_SAM.sam_list[idx] = line

    final_SAM.header = sorted(final_SAM.header, key=lambda entry: entry[1][6:])
    final_SAM.sam_list = sorted(final_SAM.sam_list,
                                key=lambda read: int(read[0][5:]))
    # Set chromosome length in header portion of SAM file to correct length
    # (will be 10,000 in temporary simulation SAM files)
    for header_line in final_SAM.header:
        header_chr = header_line[1][3:]
        header_line[2] = "LN:" + str(len(chr_len[header_chr]))
    final_SAM.output(sam[:-3])
    for temp_fn in final_SAM_fns:
        os.remove(temp_fn)

    # Replaces new CIGAR string format for matches, which uses = and X, to
    # the old format M for both
    fixSAM = subprocess.Popen(
        [str(pypath), "FixARTSAMFile.py", "-i", sam[:-3], "-o", sam])
    fixSAM.wait()

    print("Duplicated regions are located at:\n")
    for (dup_chr, spos, epos) in sorted(prev_sel,
                                        key=lambda entry: (entry[0], entry[1])):
        print(str(dup_chr), ":", str(spos), "-", str(epos), sep="")
#parser.add_argument('-o', default="Data/RNA-Seq/sue1-mRNA-Seq/set3/Thalyrata/sue_mRNA_set3_sw_aln_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="Data/RNA-Seq/Col_RNA-Seq/Thalyrata/SRR493036_aln_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_01nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_02nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_03nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_04nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_05nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_06nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_07nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_08nc_gene_count.tsv", type=str, help='Output file 
for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_09nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_10nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') args = parser.parse_args() i_file = args.i gff_file = args.gff o_file = args.o return(i_file, gff_file, o_file) i_file, gff_file, o_file = command_line() #gff_obj = GFF.Parse_Genes(gff_file, "AtChr1;AtChr2;AtChr3;AtChr4;AtChr5") gff_obj = GFF.parse_genes(gff_file, "Chr1;Chr2;Chr3;Chr4;Chr5") gene_counts = SAM.reads_per_gene(i_file, gff_obj) with open(o_file, 'w') as outfile: outline = "Gene\tReads_mapped\n" outfile.write(outline) for gene, count in sorted(gene_counts.items(), key = lambda gene_counts: gene_counts[1], reverse=True): outline = str(gene) + "\t" + str(count) + "\n" outfile.write(outline)
exp_file, column_num, sam_file, dup_file, gff_file, loop_count = command_line()

# Open GFF file and make dictionary containing gene names & their positions
gff_genes_dict = GFF.Parse_genes(gff_file, create_nuc_dict=True)

# Store A. suecica list of DupHMM duplicated genes
# (one name per line; blank lines are ignored so they are not counted)
with open(dup_file) as infile:
    dup_list = [name for name in (raw.strip() for raw in infile) if name]
dup_list_len = len(dup_list)

# Get total number of reads w/ quality >= 20
total_reads = SAM.total_reads(sam_file)

# Store A. suecica gene counts
gene_count_dict = Counter()
with open(exp_file) as infile:
    for line in infile:
        line = line.split()
        # Skip blank lines and the header row (first field "Gene").
        if not line or line[0] == "Gene":
            continue
        gene_name = line[0]
        # column_num is 1-based.
        gene_counts = int(line[column_num - 1])
        # The original referenced the undefined names `gene` and `count`
        # here, which raised NameError on the first data row.
        (chromosome, s_pos, e_pos) = gff_genes_dict.loc(gene_name)
        dist_kb = (e_pos - s_pos) / 1000
        RPK = gene_counts / dist_kb
        RPKM = RPK / (total_reads / 1000000)
        # NOTE(review): RPKM is computed but the raw count is what gets
        # stored, as in the original -- confirm which value is intended.
        gene_count_dict[gene_name] = gene_counts
'-o', default= "/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_10nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') args = parser.parse_args() i_file = args.i gff_file = args.gff o_file = args.o return (i_file, gff_file, o_file) i_file, gff_file, o_file = command_line() #gff_obj = GFF.Parse_Genes(gff_file, "AtChr1;AtChr2;AtChr3;AtChr4;AtChr5") gff_obj = GFF.parse_genes(gff_file, "Chr1;Chr2;Chr3;Chr4;Chr5") parse_sam = SAM.Parse(i_file) parse_sam.reads_per_gene(gff_obj) parse_sam.start() gene_counts = parse_sam.get_reads_per_gene() with open(o_file, 'w') as outfile: outline = "Gene\tReads_mapped\n" outfile.write(outline) for gene, count in sorted(gene_counts.items(), key=lambda gene_counts: gene_counts[1], reverse=True): outline = str(gene) + "\t" + str(count) + "\n" outfile.write(outline)