# NOTE(review): everything down to `return (ranked_genes)` is the tail of a
# function whose header lies above this chunk. The nesting was reconstructed
# from whitespace-collapsed text and must be confirmed against the original.
        values_to_genes[count].append(gene)
    rank = 1
    for i in exp_sorted:
        for gene in values_to_genes[i]:
            ranked_genes[gene] = rank
        # NOTE(review): assumed to belong to the outer loop, so that genes
        # grouped under the same value share one rank -- TODO confirm.
        rank += 1
    return (ranked_genes)


# Script entry: read the run configuration returned by command_line().
exp_file, column_num, sam_file, dup_file, gff_file, sample_file, loop_count = command_line(
)

# Open GFF file and make dictionary containing gene names & their positions
gff_genes_dict = GFF.Parse(gff_file, create_nuc_dict=False)

# Store A. suecica list of DupHMM duplicated genes
# (one entry per line of dup_file, surrounding whitespace stripped)
dup_list = []
with open(dup_file) as infile:
    for gene_name in infile:
        gene_name = gene_name.strip()
        dup_list.append(gene_name)

# Get total number of reads w/ quality >= 20 for RPKM calculations
# (currently disabled)
#parse_sam = SAM.Parse(sam_file)
#parse_sam.total_reads()
#parse_sam.start()
#total_reads = parse_sam.get_total_reads()

# Dictionary which stores whether or not a gene has <10 reads for any given sample
# NOTE(review): this chunk opens inside the argument list of a call (presumably
# parser.add_argument) within a function whose header is above this chunk.
        help='GFF file containing start and end positions for each gene',
        metavar='GFF_File')
    args = parser.parse_args()
    in_folder = args.i
    o_file = args.o
    gff_file = args.gff
    return (in_folder, o_file, gff_file)


# Script entry: input folder, output file and GFF file come from command_line().
in_folder, o_file, gff_file = command_line()

# Open GFF file and make dictionary containing gene names & their positions
# Then add the per-library gene read counts to a dictionary
gff_genes_dict = GFF.Parse(gff_file, create_nuc_dict=True)
library_gene_counts = {}
# Only genes that are neither transposons nor common RNAs get an (initially
# empty) per-library list; chromosome/spos/epos are unpacked but not used here.
for (gene, [chromosome, spos, epos]) in gff_genes_dict:
    if not gff_genes_dict.is_transposon(
            gene) and not gff_genes_dict.is_common_rna(gene):
        library_gene_counts[gene] = []

# Walk in_folder recursively; every file found is handed to SAM.Parse.
for root, subfolders, files in os.walk(in_folder):
    for f_name in files:
        print("Working on", str(f_name))
        f_in = os.path.join(root, f_name)
        parse_sam = SAM.Parse(f_in)
        # Request per-gene read counting, run the parse, then fetch the result.
        parse_sam.reads_per_gene(gff_genes_dict)
        parse_sam.start()
        temp_gene_counts = parse_sam.get_reads_per_gene()
        # Count up the total number of reads in the library
#parser.add_argument('-o', default="Data/RNA-Seq/sue1-mRNA-Seq/set3/Thalyrata/sue_mRNA_set3_sw_aln_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="Data/RNA-Seq/Col_RNA-Seq/Thalyrata/SRR493036_aln_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_01nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_02nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_03nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_04nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_05nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_06nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_07nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_08nc_gene_count.tsv", type=str, help='Output file 
for gene-mapped read counts', metavar='OutputFile') #parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_09nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_10nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') args = parser.parse_args() i_file = args.i gff_file = args.gff o_file = args.o return(i_file, gff_file, o_file) i_file, gff_file, o_file = command_line() #gff_obj = GFF.Parse_Genes(gff_file, "AtChr1;AtChr2;AtChr3;AtChr4;AtChr5") gff_obj = GFF.parse_genes(gff_file, "Chr1;Chr2;Chr3;Chr4;Chr5") gene_counts = SAM.reads_per_gene(i_file, gff_obj) with open(o_file, 'w') as outfile: outline = "Gene\tReads_mapped\n" outfile.write(outline) for gene, count in sorted(gene_counts.items(), key = lambda gene_counts: gene_counts[1], reverse=True): outline = str(gene) + "\t" + str(count) + "\n" outfile.write(outline)
dup_file = args.d gff_file = args.gff adj_value = args.adj use_random = args.r sun_genes_filename = args.o return(pileup_file, dup_file, gff_file, adj_value, use_random, sun_genes_filename) pileup_file, dup_file, gff_file, adj_value, use_random, sun_genes_filename = command_line() dup_list = [] with open(dup_file) as infile: for line in infile: line = line.strip() dup_list.append(line) gff_genes_dict = GFF.parse_genes(gff_file) if adj_value > 0: for gene_name, [chr, start_pos, end_pos] in gff_genes_dict.gene_dict.items(): start_pos = start_pos - adj_value end_pos = end_pos + adj_value gff_genes_dict.gene_dict[gene_name] = [chr, start_pos, end_pos] if use_random == "Y": # Construct a list of randomly selected genes to replace the list of duplicated genes #dup_list = sample(gff_genes_dict.gene_dict.keys(),len(dup_list)) dup_list = gff_genes_dict.gene_dict.keys() # Delete me soon dup_positions = ";".join([chr + ":" + str(s_pos) + "-" + str(e_pos) for gene_name, [chr,s_pos,e_pos] in sorted(gff_genes_dict.gene_dict.items(), key = lambda x: (x[1][0], x[1][1]) ) if gene_name in dup_list ] ) dup_pileup = Pileup.parse(pileup_file, False, dup_positions) suns_in_gene = Counter() suns_in_gene_prob = {}
'-o', default= "/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_10nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') args = parser.parse_args() i_file = args.i gff_file = args.gff o_file = args.o return (i_file, gff_file, o_file) i_file, gff_file, o_file = command_line() #gff_obj = GFF.Parse_Genes(gff_file, "AtChr1;AtChr2;AtChr3;AtChr4;AtChr5") gff_obj = GFF.parse_genes(gff_file, "Chr1;Chr2;Chr3;Chr4;Chr5") parse_sam = SAM.Parse(i_file) parse_sam.reads_per_gene(gff_obj) parse_sam.start() gene_counts = parse_sam.get_reads_per_gene() with open(o_file, 'w') as outfile: outline = "Gene\tReads_mapped\n" outfile.write(outline) for gene, count in sorted(gene_counts.items(), key=lambda gene_counts: gene_counts[1], reverse=True): outline = str(gene) + "\t" + str(count) + "\n" outfile.write(outline)