def get_donor_acceptor_sequences(genome_fasta_file, scaffold_gff3_file,
                                 window_size=10):
    """
    Collect the donor/acceptor sequence pair of every intron in a genome.

    The FASTA file and the GFF3 file must describe the same genome. Exon
    coordinates of every transcript are read from the GFF3 file; each pair of
    consecutive exons contributes one (donor, acceptor) location pair, built
    from the second coordinate of the upstream exon and the first coordinate
    of the downstream exon. Transcripts whose first exon has start > end are
    treated as reverse-complemented and have their locations stored negated.

    Location pairs are de-duplicated and sorted per scaffold, then converted
    to sequences with slice_window(). Pairs for which either sequence is
    falsy are dropped.

    Returns a list of (donor_seq, acceptor_seq) tuples.

    NOTE(review): `window_size` is accepted but never used in this function;
    slice_window() is called without it. Confirm whether it should be
    forwarded.
    """
    genome_fasta = parse_fasta.get_all_sequences(genome_fasta_file, 'fasta')
    scaffold_gff3 = parse_gff3.parse_gff3(scaffold_gff3_file, 'exon')

    # intron_locs['scaffold_name'] = sorted, unique (donor, acceptor) pairs
    intron_locs = {}
    for scaf in scaffold_gff3:
        unique_pairs = set()
        genes = scaffold_gff3[scaf]
        for gene in genes:
            transcripts = genes[gene].mRNAs
            for tx in transcripts:
                # all exon coordinates within this transcript
                exons = transcripts[tx].details['exon']

                # start > end on the first exon flags a reverse-complemented
                # transcript; its coordinates are stored negated
                if exons[0][0] > exons[0][1]:
                    exon_ends = [-end for _start, end in exons]
                    exon_starts = [-start for start, _end in exons]
                else:
                    exon_ends = [end for _start, end in exons]
                    exon_starts = [start for start, _end in exons]

                # intron k lies between the end of exon k and the start of
                # exon k + 1
                unique_pairs.update(zip(exon_ends[:-1], exon_starts[1:]))

        intron_locs[scaf] = sorted(unique_pairs)

    # parse the locations into sequences
    donor_acceptor_sequences = []
    for scaf in scaffold_gff3:
        for donor_loc, acceptor_loc in intron_locs[scaf]:
            donor_seq = slice_window(genome_fasta[scaf], donor_loc)
            acceptor_seq = slice_window(genome_fasta[scaf], acceptor_loc)
            if not (donor_seq and acceptor_seq):
                continue

            donor_acceptor_sequences.append((donor_seq, acceptor_seq))

    return donor_acceptor_sequences
def get_genomic_context(genome_fasta_file, pos_of_interest_file, window):
    """
    Extract the sequence surrounding each position of interest.

    All sequences are read from `genome_fasta_file` with
    parse_fasta.get_all_sequences(). `pos_of_interest_file` is read as
    tab-separated rows of (scaffold name, 1-based position); rows with fewer
    than two columns are skipped.

    Returns a dict keyed by '<scaffold>_<1-based position>'. Each value is the
    base at that position flanked by `window` bases on either side. Where the
    window runs past the start or end of the scaffold, the missing bases are
    filled in with 'N', so that positions lying on the scaffold yield a
    sequence of (2 x window + 1) characters.
    """
    scaffold_seqs = parse_fasta.get_all_sequences(genome_fasta_file, 'fasta')

    context_seqs = {}
    for fields in csv.reader(pos_of_interest_file, delimiter='\t'):
        if len(fields) < 2:
            continue

        scaf_name = fields[0]
        centre = int(fields[1]) - 1     # converting to 0-based numbering
        scaf_len = len(scaffold_seqs[scaf_name])

        # clamp the window to the scaffold before slicing
        left_edge = max(0, centre - window)
        right_edge = min(scaf_len, centre + window + 1)
        context = scaffold_seqs[scaf_name][left_edge:right_edge]

        # number of bases the window overhangs the scaffold on each side
        left_pad = max(0, window - centre)
        right_pad = max(0, window - (scaf_len - (centre + 1)))

        if left_pad:
            context = 'N' * left_pad + context
        if right_pad:
            context += 'N' * right_pad

        context_seqs[scaf_name + '_' + str(centre + 1)] = context

    return context_seqs
parser = argparse.ArgumentParser(description="""
CDS files do not contain intronic regions - this script reads the gff3 file
containing start and end coordinates for genes, and extracts the genic
(i.e. exonic + intronic) sequences for each gene.""")
parser.add_argument(
    'genome_fasta',
    metavar="fasta_file",
    type=argparse.FileType('r'),
    help="FASTA file of the genome.")
parser.add_argument(
    'scaffold_gff3',
    metavar="gff3_file",
    type=argparse.FileType('r'),
    help="corresponding gff3 file of the genome.")
args = parser.parse_args()

# load scaffold sequences and gene coordinates into memory
genome_fasta = parse_fasta.get_all_sequences(args.genome_fasta, 'fasta')
scaffold_gff3 = parse_gff3.parse_gff3(args.scaffold_gff3, 'gene')

# print the genic sequence of every gene as a FASTA record
for scaf in scaffold_gff3:
    for gene in scaffold_gff3[scaf]:
        gene_coords = scaffold_gff3[scaf][gene].coords

        # start > end is taken to mean the gene is on the opposite strand
        on_crick = gene_coords[0] > gene_coords[1]

        # slice with the smaller coordinate first, regardless of strand
        genic_seq = genome_fasta[scaf][min(gene_coords):max(gene_coords)]
        if on_crick:
            genic_seq = reverse_complement(genic_seq)

        print('>' + gene)
        print(genic_seq)
import tempfile import parse_fasta parser = argparse.ArgumentParser(description=""" Script takes in two FASTA files of equal number of sequences, and does pairwise BLASTP (#1 vs. #1, #2 vs. #2, ...). Prints results to stdout.""") parser.add_argument('protein_fastas', metavar='fasta_file', type=argparse.FileType('r'), nargs=2, help='Pair of protein FASTA files.') args = parser.parse_args() first_fasta = parse_fasta.get_all_sequences(args.protein_fastas[0], 'fasta') second_fasta = parse_fasta.get_all_sequences(args.protein_fastas[1], 'fasta') assert len(first_fasta) == len(second_fasta), \ 'number of unique sequences in both files are different!' # all systems go! first_fasta = list(first_fasta.items()) second_fasta = list(second_fasta.items()) # header line print('Query', 'Hit accession', 'Hit description', 'Query length', 'Hit length',
parser = argparse.ArgumentParser(description=""" Based on the blastp results of the transcripts vs. sprot/trembl, create a GO annotation file for the transcripts.""") parser.add_argument('species', metavar="species_code", help="3/4-letter code for species in question.") parser.add_argument('prot_file', metavar="fasta_file", type=argparse.FileType('r'), help="protein FASTA file of the gene models.") parser.add_argument('-n', '--no_nr', action='store_true', help='script works without nr too!') parser.add_argument('-p', '--no_parents', action='store_true', help='use the annotation file that do not contain parents.') args = parser.parse_args() all_transcripts = parse_fasta.get_all_sequences(args.prot_file, 'fasta') sprot_tsv = open('{}_vs_sprot.tGO.tsv'.format(args.species)) trembl_tsv = open('{}_vs_trembl.tGO.tsv'.format(args.species)) if not args.no_nr: nr_tsv = open('{}_vs_nr.t1.tsv'.format(args.species)) go_tsv = open('{}_go_annots.{}all.tsv'.format(args.species, 'no_parents.' if args.no_parents else '')) transcript_go_terms = {} for line in go_tsv: cols = line.strip().split('\t') transcript_go_terms[cols[0]] = cols[1] def get_go_terms(transcript): if transcript in transcript_go_terms:
help='Tallied files are FASTQ, not FASTA.') parser.add_argument('--gzip', '-g', action='store_true', default=False, help='Tallied files are gzip-compressed.') args = parser.parse_args() # header row print('File', 'A', 'C', 'G', 'T', 'N', 'ACGT', 'ACGTN', 'GC%', sep='\t') for f in args.fasta_files: base_composition = collections.Counter() seqs = parse_fasta.get_all_sequences(f, 'fastq' if args.fastq else 'fasta', gzip_compressed=args.gzip, sequences_only=True) for s in seqs: base_composition += collections.Counter(s.upper()) a = base_composition['A'] c = base_composition['C'] g = base_composition['G'] t = base_composition['T'] acgt = a + c + g + t acgtn = sum(base_composition.values()) non_acgt = acgtn - acgt gc_pct = round((c + g) / acgt * 100, 3)
min([float(x.split('\t')[13]) for x in lines_per_query[q]])) o = [ q, hit_accessions, hit_descriptions, query_length, hit_lengths, query_coords, hit_coords, frames, max_bit_scores, total_bit_scores, identities, id_pct, coverage_pct, expects ] c_output += '\t'.join([str(x) for x in o]) + '\n' return c_output if args.remove_N: import parse_fasta query_fasta_seq = parse_fasta.get_all_sequences(args.remove_N, 'fasta') tree = xml.etree.ElementTree.parse(args.blast_xml) root = tree.getroot() # get list of <Iteration></Iteration> blastoutput_iterations = root.find('BlastOutput_iterations') iterations = blastoutput_iterations.findall('Iteration') # remove iterations that contain # "<Iteration_message>No hits found</Iteration_message>" for i in iterations: if i.find('Iteration_message') is not None: if i.find('Iteration_message').text == 'No hits found': blastoutput_iterations.remove(i)
dest='file_format', const='collapsed_fasta', help='input file is in collapsed FASTA format.') args = parser.parse_args() # file exists, counting begins read_lengths = {} # read_lengths[length] = total number nucleotide_stats = {} # nucleotide_stats[length] = {1: {'A': m, 'T': n ...}, # 2: {'A': m2, 'T': n2 ...}} VALID_NUCLEOTIDES = ['A', 'T', 'U', 'G', 'C', 'N'] # parse input file, and convert the option "cfasta" to "fasta" because # parse_fasta.py doesn't discriminate between those two formats import parse_fasta sequences = parse_fasta.get_all_sequences(args.reads_file[0], args.file_format[-5:]) # handle read inclusions/exclusions if args.include == None: included_reads = set(sequences.keys()) else: included_reads = set() for i in args.include: included_reads |= set([x.strip() for x in i]) # make sure that the reads exist in the original reads file included_reads &= set(sequences.keys()) if args.exclude != None: for e in args.exclude: included_reads -= set([x.strip() for x in e])
# gene itself. Thus, there's a possibility that # gene_info != AND exon_info == 0 !! if not ei_integer: return 'no_info' ei_integer = abs(int(ei_integer)) # NumPy ints aren't... really ints. ei = 'Exon' if ei_integer % 2 else 'Intron' # round() is dangerous. Banker's rounding. ei_number = (ei_integer + 1) // 2 ei_inverse = ei_number - exon_count[gene_id] - (ei_integer % 2) return '_'.join([ei, str(ei_number), str(ei_inverse)]) # read sequences sequence_lengths = parse_fasta.get_all_sequences(args.genome_fasta, 'fasta', lengths_only=True) if args.verbose: print('[{}] Lengths for {} sequences parsed.'.format( time.asctime(), len(sequence_lengths)), file=sys.stderr) # read coordinates of genes and exons from .gff3 file. scaffold_gff3 = parse_gff3.parse_gff3(args.genome_gff3, 'exon') # as genes might contain overlapping isoforms, the longest isoform is chosen, # if multiples exist. scaffold_gff3 = parse_gff3.pick_longest_mRNA(scaffold_gff3) # make sure features in all mRNAs are sorted properly (for exon numbering). scaffold_gff3 = parse_gff3.sort_features(scaffold_gff3) # genic regions are denoted in a NumPy array as follows:
dest='file_format', const='fasta', help='input file is in FASTA format.') fasta_opt.add_argument('--fastq', action='store_const', dest='file_format', const='fastq', help='input file is in FASTQ format.') args = parser.parse_args() # grab only the sequences - ignore all annotations import parse_fasta sequences = parse_fasta.get_all_sequences(args.reads_file[0], args.file_format, sequences_only=True) # discard annotations import collections sequences = collections.Counter(sequences) reads_counter = 0 # the y/z/a counter mentioned in script description species_identifier = args.species[0][:3] for m in sequences.most_common(): # m[0] is the sequence; m[1] is the frequency print('>{}_{}_x{}'.format(species_identifier, reads_counter, m[1])) print(m[0]) reads_counter += m[1]
action='store_true', help='ORFs need not start with Met.') parser.add_argument('--nt', action='store_true', help='print equivalent nucleotide sequences.') parser.add_argument( '--print_length', action='store_true', help='include the length of the sequence in the annot.') parser.add_argument('--nosort', action='store_true', help='disable natural sorting on output.') args = parser.parse_args() fasta_seqs = parse_fasta.get_all_sequences(args.infile, 'fasta') if args.nosort: sorted_seqs = fasta_seqs else: sorted_seqs = natural_sort(fasta_seqs) if args.longest: for s in sorted_seqs: print( find_longest_orf(s, fasta_seqs[s], relaxed=args.relaxed, display_length=args.print_length)) else: for s in sorted_seqs:
help='follow order of singular file in --include.') args = parser.parse_args() # sanity checking if bool(args.start) != bool(args.end): raise ValueError('--start and --end has to be used in conjuction!') if args.order_include: # order_include can only be True when include is True if not args.include: args.order_include == False if len(args.include) > 1: args.order_include == False # start of script - get sequence data if args.fastq: sequences = parse_fasta.get_all_sequences(args.reads_file, 'fastq') else: sequences = parse_fasta.get_all_sequences(args.reads_file, 'fasta') # handle read inclusions/exclusions if not args.include: included_reads = set(sequences.keys()) else: included_reads = set() for i in args.include: included_reads |= set([x.strip() for x in i]) # make sure that the reads exist in the original reads file included_reads &= set(sequences.keys()) if args.exclude: