def get_chromosome_lengths(rerence_multifasta):
    """ Get chromosome lengths.

    Iterates over the records yielded by sc_iter_fasta for the given file and
    maps each record's seq_gi to its seq_length. A progress message is printed
    before reading and the resulting dictionary is printed afterwards.

    @param rerence_multifasta: multifasta file with chromosomes
    @return: dictionary chr name -> chr length
    """
    # Single-argument print(...) produces the same output under Python 2
    # and Python 3.
    print("Read reference genome")
    chrs = {}
    for seq_obj in sc_iter_fasta(rerence_multifasta):
        chrs[seq_obj.seq_gi] = seq_obj.seq_length
    print(chrs)
    # Hand the mapping back to the caller, as the docstring promises.
    return chrs
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#@created: 10.10.2013
#@author: Aleksey Komissarov
#@contact: [email protected]
"""Split a multi-fasta file into one fasta file per record.

Every record is written to "<output prefix>.<name>.fa", where <name> is the
first whitespace-separated token of the record header.
"""
import sys
from trseeker.seqio.fasta_file import sc_iter_fasta
import argparse


def main():
    """Parse the command line and write each input record to its own file."""
    cli = argparse.ArgumentParser(description='Parse multi fasta.')
    cli.add_argument('-i','--input', help='Fasta input', required=True)
    cli.add_argument('-o','--output', help='Output prefix', required=True)
    options = cli.parse_args()

    for record in sc_iter_fasta(options.input, lower=False):
        # The record name is the header up to the first whitespace.
        name = record.header.split()[0]
        print(name)
        target = "%s.%s.fa" % (options.output, name)
        with open(target, "w") as out:
            out.write(record.fasta)


if __name__ == '__main__':
    main()
# NOTE(review): this fragment comes from a whitespace-collapsed source, so the
# nesting below is reconstructed — confirm it against the original file.
# `args`, `jf_api`, `input_fasta`, `jellyfish`, `Kmer2tfAPI` and
# `sc_iter_fasta` are all defined outside the visible fragment.

# Unpack the command-line options.
jf_path = args["jf"]
use_new = bool(args["new"])
k = int(args["k"])  # width of the sliding window used below
c = int(args["cutoff"])  # not referenced in the visible part of the script
fh = open(args["output"], "w")  # not written to or closed in the visible part

# `jf_api` is first tested for truth and then rebound to the query object.
# NOTE(review): `kmer2freq` is only assigned inside this branch; if the
# condition is false the lookup in the loop below relies on a definition that
# is not visible here — confirm.
if jf_api and use_new:
    jf_api = jellyfish.QueryMerFile(jf_path)
    kmer2freq = Kmer2tfAPI(jf_api)

# Walk every sequence of the input and look up each k-long window.
for sid, seq_obj in enumerate(sc_iter_fasta(input_fasta)):
    print seq_obj.header, "Length:", seq_obj.length
    sequence = seq_obj.sequence.upper()
    n = len(sequence)
    kmers = set()  # only referenced by the disabled code at the end of the loop
    for i in xrange(n-k+1):
        kmer= sequence[i:i+k]
        tf = kmer2freq[kmer]
        if tf > 0:
            # Show the position and value of each positive lookup, then wait
            # for the user before continuing.
            # NOTE(review): placing the prompt inside this `if` is a guess.
            print i, tf
            raw_input("Next item?")
        # Disabled code kept from the original:
        # kmers.add()
        # kmers.add('A' + sequence[i:i+k-1])
        # kmers.add('C' + sequence[i:i+k-1])
# NOTE(review): this fragment comes from a whitespace-collapsed source and is
# cut off right after the final `if not hits:`; the nesting below is
# reconstructed — confirm it against the original file. `load_aindex`,
# `sc_iter_fasta` and `get_reads_se_by_kmer` are defined outside the fragment.

# Hard-coded input locations passed to load_aindex and sc_iter_fasta.
settings = {
    "index_prefix": "/mnt/guatemala/akomissarov/Boechera_spatifolia/raw.23.L3",
    "aindex_prefix": "/mnt/guatemala/akomissarov/Boechera_spatifolia/raw.23.L3",
    "reads_file": "/mnt/guatemala/akomissarov/Boechera_spatifolia/raw.reads",
    "gene_fasta": "/mnt/guatemala/akomissarov/Boechera_spatifolia/apr1.fa",
}

# Window width; presumably matches the "23" in the index file names — confirm.
k = 23

index = load_aindex(settings)

used_reads = set()  # (start, spring_pos) pairs of reads already collected
results = []  # not filled in the visible part of the script

for seq_obj in sc_iter_fasta(settings["gene_fasta"]):
    # Slide a window of width k over the gene sequence.
    for i in xrange(seq_obj.length - k + 1):
        kmer = seq_obj.sequence[i:i + k]
        tf = index[kmer]
        # Skip windows whose index lookup is falsy.
        if not tf:
            continue
        print i, kmer, tf
        hits = []
        # Each item yielded by get_reads_se_by_kmer is unpacked into seven fields.
        for data in get_reads_se_by_kmer(kmer, index, used_reads):
            start, next_read_start, subread, pos, spring_pos, was_reversed, poses_in_read = data
            # Record the read so it is passed back in `used_reads` on later calls.
            used_reads.add((start, spring_pos))
            hits.append([pos, 0, subread, poses_in_read, was_reversed])
        if not hits:
            # (the visible source ends here; the body of this branch is not shown)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#@created: 10.10.2013
#@author: Aleksey Komissarov
#@contact: [email protected]
"""Rewrite a fasta file so that every record header is its running index.

Records are read with sc_iter_fasta from the input file; for each one the old
header is reported on stdout, the header is set to ">" plus the zero-based
record index, and the record is written to the output file.
"""
import sys
from trseeker.seqio.fasta_file import sc_iter_fasta
import argparse

if __name__ == '__main__':

    # The description states what the loop below does, so that --help matches
    # the script's behaviour.
    parser = argparse.ArgumentParser(
        description='Replace fasta headers with sequential record numbers.')
    parser.add_argument('-i','--input', help='Fasta input', required=True)
    parser.add_argument('-o','--output', help='Fixed output', required=True)
    args = vars(parser.parse_args())

    fasta = args["input"]
    output = args["output"]

    with open(output, "w") as fh:
        for i, seq_obj in enumerate(sc_iter_fasta(fasta)):
            # Report the header that is about to be replaced. Formatting into
            # a single argument keeps the output identical under Python 2 and
            # Python 3.
            print("%s fix %s" % (i, seq_obj.seq_head))
            # The new header is the zero-based record index.
            seq_obj.seq_head = ">%s\n" % i
            # NOTE(review): presumably seq_obj.fasta renders the record using
            # the updated seq_head — confirm against trseeker.
            fh.write(seq_obj.fasta)