metavar="fasta_file", type=argparse.FileType('r'), help="FASTA file of the genome.") parser.add_argument('scaffold_gff3', metavar="gff3_file", type=argparse.FileType('r'), help="corresponding gff3 file of the genome.") args = parser.parse_args() # read genome details into memory genome_fasta = parse_fasta.get_all_sequences(args.genome_fasta, 'fasta') scaffold_gff3 = parse_gff3.parse_gff3(args.scaffold_gff3, 'exon') # pick longest transcript in the gff3 file scaffold_gff3 = parse_gff3.pick_longest_mRNA(scaffold_gff3) # read the positions from the cov file for scaf in scaffold_gff3: for gene in scaffold_gff3[scaf]: gene_coords = scaffold_gff3[scaf][gene].coords gene_on_crick = gene_coords[0] > gene_coords[1] tx = list(scaffold_gff3[scaf][gene].mRNAs.keys())[0] mrna_coords = scaffold_gff3[scaf][gene].mRNAs[tx].details['exon'] exon_seq = '' for i in mrna_coords: temp = genome_fasta[scaf][min(i):max(i)] if gene_on_crick: temp = reverse_complement(temp)
def generate_relative_locations(n): ''' Evenly split the range {0..1} depending on n (number of divisions), and returns the midpoint of the sub-ranges. i.e. if n == 5, return [0.1, 0.3, 0.5, 0.7, 0.9] ''' return [(x + 0.5) / n for x in range(n)] # read sequences genome_sequences = parse_fasta.get_all_sequences(args.genome_fasta, 'fasta') # read coordinates of genes and exons from .gff3 file scaffold_gff3 = parse_gff3.pick_longest_mRNA( parse_gff3.parse_gff3(args.genome_gff3, 'exon')) # create dictionary to map genes to their respective scaffold (this is needed # to obtain gene coords based solely on gene names) gene_to_scaffold = {} for s in scaffold_gff3: for g in scaffold_gff3[s]: gene_to_scaffold[g] = s # print header row for results print('Gene', 'Intron relative location', 'Scaffold', 'Desired region', 'Outer region', 'Outer amplicon length',