values_to_genes[count].append(gene) rank = 1 for i in exp_sorted: for gene in values_to_genes[i]: ranked_genes[gene] = rank rank += 1 return (ranked_genes) exp_file, column_num, sam_file, dup_file, gff_file, sample_file, loop_count = command_line( ) # Open GFF file and make dictionary containing gene names & their positions gff_genes_dict = GFF.Parse(gff_file, create_nuc_dict=False) # Store A. suecica list of DupHMM duplicated genes dup_list = [] with open(dup_file) as infile: for gene_name in infile: gene_name = gene_name.strip() dup_list.append(gene_name) # Get total number of reads w/ quality >= 20 for RPKM calculations #parse_sam = SAM.Parse(sam_file) #parse_sam.total_reads() #parse_sam.start() #total_reads = parse_sam.get_total_reads() # Dictionary which stores whether or not a gene has <10 reads for any given sample
help='GFF file containing start and end positions for each gene', metavar='GFF_File') args = parser.parse_args() in_folder = args.i o_file = args.o gff_file = args.gff return (in_folder, o_file, gff_file) in_folder, o_file, gff_file = command_line() # Open GFF file and make dictionary containing gene names & their positions # Then add the per-library gene read counts to a dictionary gff_genes_dict = GFF.Parse(gff_file, create_nuc_dict=True) library_gene_counts = {} for (gene, [chromosome, spos, epos]) in gff_genes_dict: if not gff_genes_dict.is_transposon( gene) and not gff_genes_dict.is_common_rna(gene): library_gene_counts[gene] = [] for root, subfolders, files in os.walk(in_folder): for f_name in files: print("Working on", str(f_name)) f_in = os.path.join(root, f_name) parse_sam = SAM.Parse(f_in) parse_sam.reads_per_gene(gff_genes_dict) parse_sam.start() temp_gene_counts = parse_sam.get_reads_per_gene() # Count up the total number of reads in the library