def main():
    """Write a synthetic query/subject FASTA pair with two planted spikes.

    Two spikes are obtained from ``generate_spike`` (called with 30 and 40);
    each call yields a pair that is unpacked here as a nucleotide form and an
    amino-acid form (presumably matching translations of one another --
    confirm against ``generate_spike``).

    The query (record 'Q1') is random filler interleaved with the reverse
    complement of the second spike followed by the reverse complement of the
    first spike.  The subject (record 'S1') is random filler drawn from the
    amino-acid alphabet interleaved with the first and then the second
    amino-acid spike.  Both records are written to stdout.
    """
    # Filler alphabet for the subject: every value in codon_table except '*'.
    residues = set(codon_table.values()) - {'*'}

    first_nt, first_aa = generate_spike(30)
    second_nt, second_aa = generate_spike(40)
    # Debug aid: print(first_nt, first_aa, second_nt, second_aa, sep='\n')

    # Query segments, built in left-to-right order.
    query_head = randseq(200)
    query_second_spike = binf.reverse_complement(second_nt)
    query_middle = randseq(200)
    query_first_spike = binf.reverse_complement(first_nt)
    query_tail = randseq(200)
    query = (query_head + query_second_spike + query_middle
             + query_first_spike + query_tail)

    # Subject segments, built in left-to-right order.
    subj_head = randseq(100, residues)
    subj_middle = randseq(100, residues)
    subj_tail = randseq(120, residues)
    subj = subj_head + first_aa + subj_middle + second_aa + subj_tail

    binf.write_fasta_seq(sys.stdout, 'Q1', query)
    binf.write_fasta_seq(sys.stdout, 'S1', subj)
def print_longest_isoforms_for_each_gene(in_file, delimiter, seq_lengths):
    """Write to stdout one longest-isoform FASTA record per gene.

    Args:
        in_file: FASTA input handed to ``binf.parse_fasta``.
        delimiter: Passed to ``extract_ids`` together with each header.
        seq_lengths: Container keyed by gene ID and consulted through
            ``find_max_isoform_length_for_gene``.  It is MUTATED: a gene's
            entry is deleted once a record for that gene has been written.

    Records whose gene is absent from ``seq_lengths`` are skipped, so after
    the first record of maximal length is written for a gene, any later
    record of that gene (even one of equal length) is ignored.
    """
    for header, seq in binf.parse_fasta(in_file):
        _seq_id, gene_id, _isoform_id = extract_ids(header, delimiter)

        # Unknown gene, or a gene whose longest isoform was already written.
        if gene_id not in seq_lengths:
            continue

        longest = find_max_isoform_length_for_gene(gene_id, seq_lengths)
        if len(seq) != longest:
            continue

        binf.write_fasta_seq(sys.stdout, header, seq)
        # Removing the gene marks it as done; see the membership test above.
        del seq_lengths[gene_id]
def extract_exons(fasta_fname, gff_fname):
    """Write one FASTA record per GFF feature name from its 'Exon' features.

    Args:
        fasta_fname: Path given to ``HTSeq.FastaReader``.
        gff_fname: Path given to ``HTSeq.GFF_Reader``.

    GFF features are grouped by ``name`` and then by ``type``.  For each
    name, the features of type 'Exon' are sorted by ``iv.start``, the
    sequence for each exon interval is looked up in the FASTA reader, and
    the pieces are concatenated and written to stdout under that name.
    A name with no 'Exon' features yields a record with an empty sequence.
    """
    fasta = HTSeq.FastaReader(fasta_fname)
    # Original author's rationale for end_included=True:
    # (exon.end - exon.start) % 3 == 2.
    annotations = HTSeq.GFF_Reader(gff_fname, end_included=True)

    # feature name -> feature type -> features in file order
    grouped = defaultdict(lambda: defaultdict(list))
    for feature in annotations:
        grouped[feature.name][feature.type].append(feature)

    for name, by_type in grouped.items():
        ordered_exons = sorted(by_type['Exon'],
                               key=lambda feature: feature.iv.start)
        pieces = []
        for exon in ordered_exons:
            pieces.append(str(fasta[exon.iv]))
        binf.write_fasta_seq(sys.stdout, name, ''.join(pieces))
def main():
    """Print a random query ('Q1') and subject ('S1') carrying two spikes.

    ``generate_spike`` is called twice (with 30, then 40); each result is
    unpacked into a nucleotide part and an amino-acid part (assumed to
    correspond to each other -- verify in ``generate_spike``).

    Query layout:   random(200) | revcomp(spike 2) | random(200)
                    | revcomp(spike 1) | random(200)
    Subject layout: random(100) | spike 1 | random(100) | spike 2
                    | random(120), with random filler drawn from the
                    codon_table values minus '*'.

    Both records are written to stdout via ``binf.write_fasta_seq``.
    """
    stop_symbol = {'*'}
    protein_alphabet = set(codon_table.values()) - stop_symbol

    nt_spike_a, aa_spike_a = generate_spike(30)
    nt_spike_b, aa_spike_b = generate_spike(40)
    # Debug aid: print(nt_spike_a, aa_spike_a, nt_spike_b, aa_spike_b, sep='\n')

    # Operands are evaluated left to right, so the order of the random
    # draws matches the layout described in the docstring.
    query = (
        randseq(200)
        + binf.reverse_complement(nt_spike_b)
        + randseq(200)
        + binf.reverse_complement(nt_spike_a)
        + randseq(200)
    )
    subj = (
        randseq(100, protein_alphabet)
        + aa_spike_a
        + randseq(100, protein_alphabet)
        + aa_spike_b
        + randseq(120, protein_alphabet)
    )

    for record_name, record_seq in (('Q1', query), ('S1', subj)):
        binf.write_fasta_seq(sys.stdout, record_name, record_seq)
def print_longest_isoforms_for_each_gene(in_file, delimiter, seq_lengths):
    """Emit, for each gene, the first FASTA record whose length is maximal.

    Args:
        in_file: Source of FASTA records, read with ``binf.parse_fasta``.
        delimiter: Forwarded to ``extract_ids`` to split each header into
            (sequence ID, gene ID, isoform ID).
        seq_lengths: Lookup keyed by gene ID, queried by
            ``find_max_isoform_length_for_gene``.  Modified in place: the
            gene's key is removed as soon as its record is written.

    Matching records are written to stdout with their original header.
    Genes not present in ``seq_lengths`` produce no output.
    """
    for header, seq in binf.parse_fasta(in_file):
        _, gene_id, _ = extract_ids(header, delimiter)

        if gene_id in seq_lengths:
            target_len = find_max_isoform_length_for_gene(
                gene_id, seq_lengths)
            if len(seq) == target_len:
                binf.write_fasta_seq(sys.stdout, header, seq)
                # Dropping the gene here is what prevents a second record
                # for the same gene from being written later.
                del seq_lengths[gene_id]
def main():
    """Rename FASTA records read from stdin and save the name mapping.

    Command-line arguments:
        argv[1]: sequence-set ID used as the prefix of every new name.
        argv[2]: path of the FASTA file to write with renamed records.
        argv[3]: path of the JSON file to write with the name mapping.

    Each record is renamed to "<seq_set_id>_prot<N>", where N counts records
    from 1 in input order.  The JSON file maps each new name back to the
    original record ID.  It is written only after the FASTA file has been
    completely written and closed.
    """
    seq_set_id = sys.argv[1]
    munged_fasta_filename = sys.argv[2]
    mapping_filename = sys.argv[3]

    new_to_original = {}
    with open(munged_fasta_filename, "w") as fasta_out:
        records = binf.parse_fasta(sys.stdin)
        for index, (original_id, sequence) in enumerate(records, start=1):
            renamed = "%s_prot%s" % (seq_set_id, index)
            new_to_original[renamed] = original_id
            binf.write_fasta_seq(fasta_out, renamed, sequence)

    with open(mapping_filename, "w") as mapping_out:
        json.dump(new_to_original, mapping_out)
def main():
    """Give stdin FASTA records sequential names and record the mapping.

    Reads three positional arguments from ``sys.argv``: the sequence-set ID,
    the output FASTA path, and the output mapping path.

    Every input record is written to the output FASTA under the name
    '<seq_set_id>_prot<N>' (N starts at 1 and follows input order).  Once
    that file is closed, a JSON object mapping new name -> original record
    ID is dumped to the mapping path.
    """
    seq_set_id = sys.argv[1]
    munged_fasta_filename = sys.argv[2]
    mapping_filename = sys.argv[3]

    name_format = '%s_prot%s'
    original_id_by_new_name = {}

    with open(munged_fasta_filename, 'w') as munged_out:
        numbered_records = enumerate(binf.parse_fasta(sys.stdin), start=1)
        for number, record in numbered_records:
            original_id, residues = record
            munged_name = name_format % (seq_set_id, number)
            original_id_by_new_name[munged_name] = original_id
            binf.write_fasta_seq(munged_out, munged_name, residues)

    with open(mapping_filename, 'w') as mapping_out:
        json.dump(original_id_by_new_name, mapping_out)