# Script excerpt: builds an (initially empty) per-gene list of read counts and
# then walks every alignment file under in_folder, counting reads per gene.
in_folder, o_file, gff_file = command_line()

# Open GFF file and make dictionary containing gene names & their positions
# Then add the per-library gene read counts to a dictionary
gff_genes_dict = GFF.Parse(gff_file, create_nuc_dict=True)
library_gene_counts = {}
# Iterating the GFF object yields (gene, [chromosome, start, end]) pairs; only
# the gene name is used here.
for (gene, [chromosome, spos, epos]) in gff_genes_dict:
    # Genes the GFF object flags as transposons or common RNAs get no entry.
    if not gff_genes_dict.is_transposon(
            gene) and not gff_genes_dict.is_common_rna(gene):
        library_gene_counts[gene] = []
# Every file found anywhere below in_folder is treated as one library.
for root, subfolders, files in os.walk(in_folder):
    for f_name in files:
        print("Working on", str(f_name))
        f_in = os.path.join(root, f_name)
        # Register the per-gene counting task, run the parser, then fetch the
        # result (call order taken from the code; the SAM module itself is not
        # visible here).
        parse_sam = SAM.Parse(f_in)
        parse_sam.reads_per_gene(gff_genes_dict)
        parse_sam.start()
        temp_gene_counts = parse_sam.get_reads_per_gene()

        # Count up the total number of reads in the library
        gzipped = False
        total_reads = 0
        if f_in.endswith(".gz"):
            # Binary mode: each line arrives as bytes and is decoded below.
            infile = gzip.open(f_in, 'rb')
            gzipped = True
        else:
            infile = open(f_in)
        for line in infile:
            if gzipped == True: line = line.decode('utf-8')
            line = line.split()
            # NOTE(review): the excerpt ends here -- the statement that
            # increments total_reads and the code that closes infile are not
            # visible in this view.
	
	return(in_folder, o_file, gff_file)

# Script excerpt: for every alignment file under in_folder, count reads per
# gene and the total number of reads whose fifth column is at least 20.
in_folder, o_file, gff_file = command_line()

# Open GFF file and make dictionary containing gene names & their positions
# Then add the per-library gene read counts to a dictionary
gff_genes_dict = GFF.Parse_genes(gff_file, create_nuc_dict=True)
library_gene_counts = {}
for (gene, [chromosome, spos, epos]) in gff_genes_dict:
    library_gene_counts[gene] = []
for root, subfolders, files in os.walk(in_folder):
    for f_name in files:
        print("Working on", str(f_name))
        f_in = os.path.join(root, f_name)
        temp_gene_counts = SAM.reads_per_gene(f_in, gff_genes_dict)

        # Count up the total number of reads in the library
        gzipped = f_in.endswith(".gz")
        total_reads = 0
        if gzipped:
            # Text mode decodes once at the I/O boundary instead of per line.
            infile = gzip.open(f_in, 'rt', encoding='utf8')
        else:
            infile = open(f_in)
        # The context manager guarantees the handle is closed even when a
        # malformed line raises below.
        with infile:
            for line in infile:
                line = line.split()
                # Skip blank lines and "@" header lines.
                if not line or line[0].startswith("@"):
                    continue
                # Column 5 (index 4) is MAPQ in the SAM layout; keep reads
                # with a value of 20 or more.
                if int(line[4]) >= 20:
                    total_reads += 1

        # Change read counts to RPKM value
Exemple #3
0
def simulate_reads(ref, chr, readlen, readcov, dupnum, duplen, sam, chr_len):
    """Simulate reads with ART and add extra coverage for random duplicated regions.

    The selected chromosomes are simulated in 10,000-base windows with
    ``art_illumina.exe``; the position field of each window's SAM records is
    shifted by the window offset. ``dupnum`` non-overlapping regions of length
    ``duplen`` are then drawn at random and simulated a second time so they
    receive extra coverage. All temporary SAM files are merged, read names are
    renumbered, header lengths are corrected, and ``FixARTSAMFile.py`` is run
    on the merged file to produce ``sam``. The chosen regions are printed.

    Args:
        ref: Unused by this function; kept for interface compatibility.
        chr: ``"All"`` to use every key of ``chr_len`` matching ``Chr<digits>``,
            or a comma-separated list of chromosome names.
        readlen: Read length passed to ART (``-l``).
        readcov: Fold coverage passed to ART (``-f``).
        dupnum: Number of duplicated regions to draw.
        duplen: Length of each duplicated region.
        sam: Final output path. ``sam[:-3]`` is used as the prefix of the
            intermediate files (presumably ``sam`` ends in ``"sam"``).
        chr_len: Mapping of chromosome name to its sequence string (the
            values are sliced and written out as FASTA records).

    Raises:
        ValueError: If ``dupnum`` is positive but ``duplen`` is longer than
            every selected chromosome (region selection could never finish).

    Note:
        Region selection retries until a free region is found, so it can still
        loop for a long time when the chromosomes are nearly saturated.
    """
    window = 10000  # Bases simulated per ART invocation
    final_SAM_fns = []  # Every temporary SAM file that is merged at the end

    if chr == "All":
        chr_name_re = re.compile(r"Chr\d{1,}")
        chr_list = [name for name in chr_len if chr_name_re.match(name)]
    else:
        chr_list = chr.split(',')

    # Fail fast instead of spinning forever in the region-selection loop.
    if dupnum > 0 and chr_list and all(len(chr_len[name]) < duplen for name in chr_list):
        raise ValueError("duplen (" + str(duplen) + ") exceeds the length of every selected chromosome")

    # --- Regular read profile: one ART run per window of each chromosome ---
    for chrom in chr_list:
        sequence = chr_len[chrom]
        for offset in range(0, len(sequence), window):
            out_fn = str(sam[:-3]) + "_temp_" + str(chrom) + "_" + str(offset)
            temp_fasta = "regtemp_" + str(offset) + ".fasta"
            with open(temp_fasta, 'w') as outfile:
                outfile.write(">" + str(chrom) + "\n" + sequence[offset:offset + window])
            # Example usage: art_illumina.exe -i ../at9_chr3.fasta -l 34 -f 2 -o Output/RandomAtReads -sam
            # An argument list (not a joined string) works on every platform
            # and survives paths containing spaces.
            simulation = subprocess.Popen(["art_illumina.exe", "-i", str(temp_fasta), "-l", str(readlen), "-f", str(readcov), "-o", out_fn, "-sam", "-q"])
            simulation.wait()
            new_out_fn = out_fn + ".sam"
            if offset > 0:
                # ART reports positions relative to the window; shift field 3
                # (POS in the SAM layout) back to chromosome coordinates.
                reg_SAM = SAM.parse(new_out_fn, True)
                for read in reg_SAM.sam_list:
                    read[3] = str(int(read[3]) + offset)
                reg_SAM.output(new_out_fn)
            os.remove(temp_fasta)
            os.remove(out_fn + ".fq")
            os.remove(out_fn + ".aln")
            final_SAM_fns.append(new_out_fn)

    # --- Duplicated regions: simulate each chosen region a second time ---
    prev_sel = []  # (chromosome, start, end) of every region chosen so far
    for dup_index in range(1, dupnum + 1):
        while True:
            chrom = ''.join(sample(chr_list, 1))
            spos = randrange(0, len(chr_len[chrom]))
            epos = spos + duplen
            if epos > len(chr_len[chrom]):
                continue
            # Reject the candidate if it overlaps (bounds inclusive) any region
            # already chosen on the same chromosome. This also catches a
            # candidate that fully contains an earlier region.
            overlaps = any(
                c == chrom and spos <= end and epos >= start
                for (c, start, end) in prev_sel
            )
            if not overlaps:
                prev_sel.append((chrom, spos, epos))
                break

        temp_fasta = "duptemp_" + str(dup_index) + ".fasta"
        dup_prefix = temp_fasta[:-6]  # Strip ".fasta"
        with open(temp_fasta, 'w') as outfile:
            outfile.write(">" + str(chrom) + "\n" + chr_len[chrom][spos:epos])
        simulation = subprocess.Popen(["art_illumina.exe", "-i", str(temp_fasta), "-l", str(readlen), "-f", str(readcov), "-o", str(dup_prefix), "-sam", "-q"])
        simulation.wait()
        dup_fn = dup_prefix + ".sam"
        shifted_fn = dup_fn[:-4] + "_temp.sam"
        dup_SAM = SAM.parse(dup_fn, True)
        for read in dup_SAM.sam_list:
            # Shift region-relative positions to chromosome coordinates.
            read[3] = str(int(read[3]) + spos)
        dup_SAM.output(shifted_fn)
        os.remove(temp_fasta)
        os.remove(dup_prefix + ".fq")
        os.remove(dup_prefix + ".aln")
        os.remove(dup_fn)
        final_SAM_fns.append(shifted_fn)

    # --- Merge all temporary SAM files and make read names unique ---
    final_SAM = SAM.parse(final_SAM_fns, True)
    read_name_re = re.compile(r"Chr\d{1}-(\d+)")

    # The counter starts from the number embedded in the first read's name.
    max_num = int(read_name_re.match(final_SAM.sam_list[0][0]).group(1))
    min_met = False
    for read in final_SAM.sam_list:  # Ensure that read names from one SAM file do not coincide with read names from another SAM file
        r_name = read[0]
        match = read_name_re.match(r_name)
        if not match:
            continue
        if int(match.group(1)) == 1 and not min_met:
            # First read numbered 1: every matching read after it is renamed.
            min_met = True
            continue
        if min_met:
            max_num += 1
            read[0] = r_name[:5] + str(max_num)

    final_SAM.header = sorted(final_SAM.header, key=lambda entry: entry[1][6:])
    final_SAM.sam_list = sorted(final_SAM.sam_list, key=lambda read: int(read[0][5:]))
    for entry in final_SAM.header:  # Set chromosome length in header portion of SAM file to correct length (will be 10,000 in temporary simulation SAM files)
        chrom = entry[1][3:]
        entry[2] = "LN:" + str(len(chr_len[chrom]))
    final_SAM.output(sam[:-3])
    for temp_fn in final_SAM_fns:
        os.remove(temp_fn)

    # Replaces new CIGAR string format for matches, which uses = and X, to the old format M for both
    fixSAM = subprocess.Popen([str(pypath), "FixARTSAMFile.py", "-i", sam[:-3], "-o", sam])
    fixSAM.wait()

    print("Duplicated regions are located at:\n")
    for (chrom, spos, epos) in sorted(prev_sel, key=lambda entry: (entry[0], entry[1])):
        print(str(chrom), ":", str(spos), "-", str(epos), sep="")
	#parser.add_argument('-o', default="Data/RNA-Seq/sue1-mRNA-Seq/set3/Thalyrata/sue_mRNA_set3_sw_aln_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="Data/RNA-Seq/Col_RNA-Seq/Thalyrata/SRR493036_aln_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_01nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_02nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_03nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_04nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_05nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_06nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_07nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_08nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_09nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_10nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	
	args = parser.parse_args()
	i_file = args.i
	gff_file = args.gff
	o_file = args.o

	return(i_file, gff_file, o_file)

# Script: count reads per gene for one alignment file and write a two-column
# TSV ordered from the most- to the least-covered gene.
i_file, gff_file, o_file = command_line()
# Alternative call kept for reference (different chromosome naming):
#gff_obj = GFF.Parse_Genes(gff_file, "AtChr1;AtChr2;AtChr3;AtChr4;AtChr5")
gff_obj = GFF.parse_genes(gff_file, "Chr1;Chr2;Chr3;Chr4;Chr5")
gene_counts = SAM.reads_per_gene(i_file, gff_obj)

with open(o_file, 'w') as outfile:
    outfile.write("Gene\tReads_mapped\n")
    # Highest count first; the sort is stable, so ties keep their order.
    ranked = sorted(gene_counts.items(), key=lambda pair: pair[1], reverse=True)
    for gene, count in ranked:
        outfile.write("\t".join((str(gene), str(count))) + "\n")
# Script excerpt: load the duplicated-gene list and the per-gene read counts,
# computing RPK/RPKM for each gene from its GFF coordinates.
exp_file, column_num, sam_file, dup_file, gff_file, loop_count = command_line()

# Open GFF file and make dictionary containing gene names & their positions
gff_genes_dict = GFF.Parse_genes(gff_file, create_nuc_dict=True)

# Store A. suecica list of DupHMM duplicated genes
with open(dup_file) as infile:
    dup_list = [line.strip() for line in infile]
dup_list_len = len(dup_list)

# Get total number of reads w/ quality >= 20
total_reads = SAM.total_reads(sam_file)

# Store A. suecica gene counts
gene_count_dict = Counter()
with open(exp_file) as infile:
    for line in infile:
        line = line.split()
        if not line:
            # Blank lines carry no gene record.
            continue
        gene_name = line[0]
        if gene_name != "Gene":  # Skip the header row
            # column_num is 1-based on the command line.
            gene_counts = int(line[column_num - 1])
            # Look up this row's gene (the original referenced the undefined
            # names `gene` and `count` here).
            (chromosome, s_pos, e_pos) = gff_genes_dict.loc(gene_name)
            dist_kb = (e_pos - s_pos) / 1000
            RPK = gene_counts / dist_kb
            # NOTE(review): RPKM is computed but the raw count is what gets
            # stored below; no later use of RPKM is visible in this view.
            RPKM = RPK / (total_reads / 1000000)
            gene_count_dict[gene_name] = gene_counts
Exemple #6
0
        '-o',
        default=
        "/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_10nc_gene_count.tsv",
        type=str,
        help='Output file for gene-mapped read counts',
        metavar='OutputFile')

    args = parser.parse_args()
    i_file = args.i
    gff_file = args.gff
    o_file = args.o

    return (i_file, gff_file, o_file)


# Script: run the SAM parser's per-gene read counting on one alignment file
# and write the result as a TSV sorted by descending read count.
i_file, gff_file, o_file = command_line()
# Alternative call kept for reference (different chromosome naming):
#gff_obj = GFF.Parse_Genes(gff_file, "AtChr1;AtChr2;AtChr3;AtChr4;AtChr5")
gff_obj = GFF.parse_genes(gff_file, "Chr1;Chr2;Chr3;Chr4;Chr5")

# Register the counting task, run the parse, then collect the counts.
parse_sam = SAM.Parse(i_file)
parse_sam.reads_per_gene(gff_obj)
parse_sam.start()
gene_counts = parse_sam.get_reads_per_gene()

with open(o_file, 'w') as outfile:
    outfile.write("Gene\tReads_mapped\n")
    # Stable sort: genes with equal counts keep their original relative order.
    by_count_desc = sorted(gene_counts.items(),
                           key=lambda item: item[1],
                           reverse=True)
    for gene, count in by_count_desc:
        row = str(gene) + "\t" + str(count) + "\n"
        outfile.write(row)