Esempio n. 1
0
        values_to_genes[count].append(gene)

    rank = 1
    for i in exp_sorted:
        for gene in values_to_genes[i]:
            ranked_genes[gene] = rank
        rank += 1

    return (ranked_genes)


# Unpack every command-line option in a single assignment.
(exp_file, column_num, sam_file, dup_file, gff_file, sample_file,
 loop_count) = command_line()

# Parse the GFF file into a dictionary of gene names & their positions
# (nucleotide-dictionary creation is switched off here).
gff_genes_dict = GFF.Parse(gff_file, create_nuc_dict=False)

# Load the A. suecica DupHMM duplicated genes: one entry per line of the
# file, with surrounding whitespace removed.
with open(dup_file) as infile:
    dup_list = [entry.strip() for entry in infile]

# Get total number of reads w/ quality >= 20 for RPKM calculations
#parse_sam = SAM.Parse(sam_file)
#parse_sam.total_reads()
#parse_sam.start()
#total_reads = parse_sam.get_total_reads()

# Dictionary which stores whether or not a gene has <10 reads for any given sample
Esempio n. 2
0
        help='GFF file containing start and end positions for each gene',
        metavar='GFF_File')

    args = parser.parse_args()
    in_folder = args.i
    o_file = args.o
    gff_file = args.gff

    return (in_folder, o_file, gff_file)


in_folder, o_file, gff_file = command_line()

# Parse the GFF file into a dictionary of gene names & their positions, then
# give every gene that is neither a transposon nor a common RNA an empty list
# that will later hold its per-library read counts.
gff_genes_dict = GFF.Parse(gff_file, create_nuc_dict=True)
library_gene_counts = {
    gene_id: []
    for gene_id, (_chrom, _start, _end) in gff_genes_dict
    if not (gff_genes_dict.is_transposon(gene_id)
            or gff_genes_dict.is_common_rna(gene_id))
}
# Visit every file found under in_folder (sub-folders included) and fetch the
# per-gene read counts reported for it. No extension filter is applied, so
# each file encountered is handed to the SAM parser.
for root, subfolders, files in os.walk(in_folder):
    for f_name in files:
        print("Working on", str(f_name))
        f_in = os.path.join(root, f_name)
        # Request per-gene counting against the GFF gene positions, then run
        # the parser. NOTE(review): presumably start() performs the actual
        # pass over the file -- confirm against the SAM module.
        parse_sam = SAM.Parse(f_in)
        parse_sam.reads_per_gene(gff_genes_dict)
        parse_sam.start()
        temp_gene_counts = parse_sam.get_reads_per_gene()

        # Count up the total number of reads in the library
Esempio n. 3
0
	#parser.add_argument('-o', default="Data/RNA-Seq/sue1-mRNA-Seq/set3/Thalyrata/sue_mRNA_set3_sw_aln_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="Data/RNA-Seq/Col_RNA-Seq/Thalyrata/SRR493036_aln_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_01nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_02nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_03nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_04nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_05nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_06nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_07nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_08nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	#parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_09nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	parser.add_argument('-o', default="/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_10nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile')
	
	args = parser.parse_args()
	i_file = args.i
	gff_file = args.gff
	o_file = args.o

	return(i_file, gff_file, o_file)

i_file, gff_file, o_file = command_line()

# The second argument is a ';'-separated list of chromosome names.
# NOTE(review): presumably it limits which chromosomes are loaded -- confirm
# in GFF.parse_genes. The commented line is the "AtChr"-prefixed variant.
#gff_obj = GFF.Parse_Genes(gff_file, "AtChr1;AtChr2;AtChr3;AtChr4;AtChr5")
gff_obj = GFF.parse_genes(gff_file, "Chr1;Chr2;Chr3;Chr4;Chr5")
gene_counts = SAM.reads_per_gene(i_file, gff_obj)

# Emit a two-column TSV: header first, then one row per gene ordered from
# the highest to the lowest mapped-read count.
with open(o_file, 'w') as outfile:
	outfile.write("Gene\tReads_mapped\n")
	by_read_count = sorted(gene_counts.items(), key=lambda pair: pair[1], reverse=True)
	outfile.writelines("{!s}\t{!s}\n".format(gene, count) for gene, count in by_read_count)
Esempio n. 4
0
	dup_file = args.d
	gff_file = args.gff
	adj_value = args.adj
	use_random = args.r
	sun_genes_filename = args.o
	
	return(pileup_file, dup_file, gff_file, adj_value, use_random, sun_genes_filename)
	
# Unpack the command-line options.
pileup_file, dup_file, gff_file, adj_value, use_random, sun_genes_filename = command_line()

# Read the duplicated gene names: one entry per line, surrounding whitespace removed.
with open(dup_file) as infile:
	dup_list = [line.strip() for line in infile]

gff_genes_dict = GFF.parse_genes(gff_file)

# Optionally widen every gene interval by adj_value on both sides.
# The chromosome is bound to `chrom` rather than `chr` so that the builtin
# chr() is not shadowed at module level for the rest of the script.
if adj_value > 0:
	for gene_name, [chrom, start_pos, end_pos] in gff_genes_dict.gene_dict.items():
		# Only values of existing keys are replaced, so the dict's size does
		# not change while it is being iterated.
		gff_genes_dict.gene_dict[gene_name] = [chrom, start_pos - adj_value, end_pos + adj_value]

if use_random == "Y":
	# Intended to replace the duplicated genes with a random sample of the same
	# size (see the commented-out line). NOTE: as currently written, EVERY gene
	# in the GFF is selected instead.
	#dup_list = sample(gff_genes_dict.gene_dict.keys(),len(dup_list))
	dup_list = gff_genes_dict.gene_dict.keys() # Delete me soon

# Membership is tested once per gene below; a set makes each test O(1) instead
# of scanning dup_list every time. dup_list itself is left untouched.
dup_lookup = set(dup_list)

# Build "chrom:start-end" for every selected gene, ordered by chromosome and
# then by start position, and join the pieces with ';'.
genes_by_position = sorted(gff_genes_dict.gene_dict.items(), key = lambda x: (x[1][0], x[1][1]))
dup_positions = ";".join(
	chrom + ":" + str(s_pos) + "-" + str(e_pos)
	for gene_name, [chrom, s_pos, e_pos] in genes_by_position
	if gene_name in dup_lookup
)
dup_pileup = Pileup.parse(pileup_file, False, dup_positions)

suns_in_gene = Counter()
suns_in_gene_prob = {}
Esempio n. 5
0
        '-o',
        default=
        "/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_10nc_gene_count.tsv",
        type=str,
        help='Output file for gene-mapped read counts',
        metavar='OutputFile')

    args = parser.parse_args()
    i_file = args.i
    gff_file = args.gff
    o_file = args.o

    return (i_file, gff_file, o_file)


i_file, gff_file, o_file = command_line()
#gff_obj = GFF.Parse_Genes(gff_file, "AtChr1;AtChr2;AtChr3;AtChr4;AtChr5")
gff_obj = GFF.parse_genes(gff_file, "Chr1;Chr2;Chr3;Chr4;Chr5")

# Set the SAM parser up for per-gene counting, run it, then fetch the tallies.
parse_sam = SAM.Parse(i_file)
parse_sam.reads_per_gene(gff_obj)
parse_sam.start()
gene_counts = parse_sam.get_reads_per_gene()

# Write a two-column TSV: the header, then one row per gene ordered from the
# highest to the lowest mapped-read count.
with open(o_file, 'w') as outfile:
    outfile.write("Gene\tReads_mapped\n")
    ranked = sorted(gene_counts.items(), key=lambda kv: kv[1], reverse=True)
    for gene, count in ranked:
        outfile.write("\t".join((str(gene), str(count))) + "\n")