-
Notifications
You must be signed in to change notification settings - Fork 1
/
sxn_gene_variation.py
69 lines (59 loc) · 2.83 KB
/
sxn_gene_variation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python3
import argparse, os, gzip
import sys; sys.path.append('../Packages/')
from GenomeTools import GFF, SAM
def command_line(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. Leaving it as ``None`` keeps the historical
            behaviour of reading the real command line.

    Returns:
        A ``(in_folder, o_file, gff_file)`` tuple of strings: the folder
        holding the SxN F2 SAM libraries, the path of the output .tsv
        table, and the path of the GFF annotation file.
    """
    # %(prog)s is expanded by argparse to the invoked script name, so the
    # help text cannot drift out of sync with the file name again.
    parser = argparse.ArgumentParser(description="%(prog)s totals the number of reads mapping to each gene for all provided SxN F2 SAM file libraries. It then outputs the mean and variance of each gene into a .tsv file format.")
    parser.add_argument('-i', default="Data/Sue/SxN_F2s/VarianceExp", help='Input folder containing all the SxN F2 SAM libraries.', metavar='Input_Folder')
    parser.add_argument('-o', default="Output/SxN_Variance.tsv", help='Tab-separated value table containing the mean and variance for each gene.', metavar='Output_File')
    parser.add_argument('-gff', default="../Thalyrata.gff", help='GFF file containing start and end positions for each gene', metavar='GFF_File')
    args = parser.parse_args(argv)
    return (args.i, args.o, args.gff)
in_folder, o_file, gff_file = command_line()

# Load the gene annotation from the GFF file, then give every gene an empty
# list; the per-library loop later appends one value per library to it.
gff_genes_dict = GFF.Parse_genes(gff_file, create_nuc_dict=True)
library_gene_counts = {
    gene: [] for gene, (_chromosome, _start, _end) in gff_genes_dict
}
def _count_mapped_reads(sam_path, min_mapq=20):
    """Count the alignment lines of a SAM file whose MAPQ is high enough.

    Args:
        sam_path: Path to a SAM file; a name ending in ".gz" is read as
            gzip-compressed text.
        min_mapq: Minimum value of the fifth column (MAPQ in the SAM
            format) for a line to be counted.

    Returns:
        The number of non-header lines with MAPQ >= ``min_mapq``.
    """
    if sam_path.endswith(".gz"):
        # Text mode decodes once at the I/O boundary instead of per line.
        handle = gzip.open(sam_path, 'rt', encoding='utf8')
    else:
        handle = open(sam_path)
    n_reads = 0
    # The with-block guarantees the handle is closed, even on a parse error.
    with handle:
        for line in handle:
            fields = line.split()
            # Skip blank lines (no fields at all) and "@" header lines.
            if not fields or fields[0].startswith("@"):
                continue
            if int(fields[4]) >= min_mapq:
                n_reads += 1
    return n_reads


# Walk every file below the input folder; each file is treated as one library.
for root, subfolders, files in os.walk(in_folder):
    for f_name in files:
        print("Working on", str(f_name))
        f_in = os.path.join(root, f_name)
        temp_gene_counts = SAM.reads_per_gene(f_in, gff_genes_dict)
        # Count up the total number of reads in the library
        total_reads = _count_mapped_reads(f_in)
        if total_reads == 0:
            # RPKM is undefined without reads; skipping avoids dividing by 0.
            print("Skipping", str(f_name), "- no reads with MAPQ >= 20")
            continue
        millions_of_reads = total_reads / 1000000
        # Change read counts to RPKM value
        for gene, count in temp_gene_counts.items():
            (chromosome, s_pos, e_pos) = gff_genes_dict.loc(gene)
            # NOTE(review): a gene with e_pos == s_pos still divides by zero
            # below - confirm whether the annotation can contain such genes.
            dist_kb = (e_pos - s_pos) / 1000
            RPK = count / dist_kb
            RPKM = RPK / millions_of_reads
            library_gene_counts[gene].append(RPKM)
# Calculate gene read count means and variance
# The variance is the sample variance (n - 1 denominator). It is undefined
# for fewer than two values and the mean is undefined for none, so NaN is
# stored in those cases instead of raising ZeroDivisionError. Every gene
# still gets an entry in both dictionaries.
gene_means = {}
gene_variances = {}
for gene, RPKM_list in library_gene_counts.items():
    n_values = len(RPKM_list)
    if n_values == 0:
        gene_means[gene] = float("nan")
        gene_variances[gene] = float("nan")
        continue
    gene_mean = sum(RPKM_list) / n_values
    gene_means[gene] = gene_mean
    if n_values < 2:
        gene_var = float("nan")
    else:
        gene_var = sum([(x - gene_mean) ** 2 for x in RPKM_list]) / (n_values - 1)
    gene_variances[gene] = gene_var
# Write one tab-separated row per gene, in sorted gene order, below a header.
with open(o_file, 'w') as outfile:
    outfile.write("Gene\tMean\tVariance\n")
    for gene in sorted(gff_genes_dict.gene_dict.keys()):
        row = (gene, gene_means[gene], gene_variances[gene])
        outfile.write("\t".join(map(str, row)) + "\n")