Beispiel #1
0
        values_to_genes[count].append(gene)

    rank = 1
    for i in exp_sorted:
        for gene in values_to_genes[i]:
            ranked_genes[gene] = rank
        rank += 1

    return (ranked_genes)


# Unpack the seven values returned by command_line()
(exp_file, column_num, sam_file, dup_file, gff_file, sample_file,
 loop_count) = command_line()

# Parse the GFF file into a dictionary of gene names and their positions
# (create_nuc_dict is explicitly turned off for this script)
gff_genes_dict = GFF.Parse(gff_file, create_nuc_dict=False)

# Load the A. suecica genes listed in the DupHMM duplication file:
# one entry per line, with surrounding whitespace stripped
with open(dup_file) as infile:
    dup_list = [gene_name.strip() for gene_name in infile]

# Get total number of reads w/ quality >= 20 for RPKM calculations
#parse_sam = SAM.Parse(sam_file)
#parse_sam.total_reads()
#parse_sam.start()
#total_reads = parse_sam.get_total_reads()

# Dictionary which stores whether or not a gene has <10 reads for any given sample
Beispiel #2
0
        help='GFF file containing start and end positions for each gene',
        metavar='GFF_File')

    args = parser.parse_args()
    in_folder = args.i
    o_file = args.o
    gff_file = args.gff

    return (in_folder, o_file, gff_file)


# Unpack the input folder, output file and GFF file returned by command_line()
in_folder, o_file, gff_file = command_line()

# Parse the GFF file into a dictionary of gene names and their positions.
# Every gene that is neither a transposon nor a common RNA then gets an
# empty list in library_gene_counts, to hold its per-library read counts.
gff_genes_dict = GFF.Parse(gff_file, create_nuc_dict=True)
library_gene_counts = {}
for (gene, [chromosome, spos, epos]) in gff_genes_dict:
    # Skip transposons and common RNAs; only the remaining genes are tracked
    if gff_genes_dict.is_transposon(gene) or gff_genes_dict.is_common_rna(
            gene):
        continue
    library_gene_counts[gene] = []
for root, subfolders, files in os.walk(in_folder):
    for f_name in files:
        print("Working on", str(f_name))
        f_in = os.path.join(root, f_name)
        parse_sam = SAM.Parse(f_in)
        parse_sam.reads_per_gene(gff_genes_dict)
        parse_sam.start()
        temp_gene_counts = parse_sam.get_reads_per_gene()

        # Count up the total number of reads in the library