def check_keyfn(path, klass, inplace):
    f = Fasta(path, record_class=klass, flatten_inplace=inplace,
              key_fn=lambda key: key.split()[0])
    assert sorted(f.keys()) == ['a', 'b', 'c'], f.keys()
    fix(path)

    ff = Fasta(path, record_class=klass, flatten_inplace=inplace)
    assert sorted(ff.keys()) == ['a extra', 'b extra', 'c extra'], (ff.keys(), klass)
    fix(path)
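# Hedged usage sketch (not from the original tests): how key_fn rewrites headers
# at index time. "example.fasta" is a made-up file whose headers look like
# ">a extra"; pyfasta writes a .flat/.gdx index next to the file on first load.
from pyfasta import Fasta

f = Fasta("example.fasta", key_fn=lambda key: key.split()[0])
print(sorted(f.keys()))   # e.g. ['a', 'b', 'c'] instead of ['a extra', ...]
print(f['a'][:10])        # records are sliceable without loading the whole file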
def read_fa(fa='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10_transcriptome.fa'):
    gj.printFuncRun('read_fa')
    gj.printFuncArgs()
    fa_dict = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    print fa_dict.keys()[0:3]
    gj.printFuncRun('read_fa')
    return fa_dict
def parse_align(train_fa, validation_fa, blastn_output, savefn):
    train_fa_dict = Fasta(train_fa)
    validation_fa_dict = Fasta(validation_fa)
    seq_similarity_dict = nested_dict(2, list)
    for i in list(validation_fa_dict.keys()):
        for j in list(train_fa_dict.keys()):
            seq_similarity_dict[i][j] = np.nan
    with open(blastn_output, 'r') as OUT:
        for line in OUT:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            arr = line.split('\t')
            seq_similarity_dict[arr[0]][arr[1]] = -np.log10(float(arr[10]))
    seq_similarity_df = pd.DataFrame.from_dict(seq_similarity_dict, orient='index')

    fig, ax = plt.subplots(figsize=(12, 30))
    sns.heatmap(seq_similarity_df.T.head(1000), xticklabels=False, yticklabels=False, cmap="YlGnBu")
    plt.tight_layout()
    plt.savefig(savefn)
    plt.close()

    return seq_similarity_df
def check_keyfn2(path, klass, inplace):
    f = Fasta(path, record_class=klass, flatten_inplace=inplace,
              key_fn=lambda key: "-".join(key.split()))
    assert sorted(f.keys()) == ['a-extra', 'b-extra', 'c-extra'], f.keys()
    assert f['a-extra']
    fix(path)
class Alg:
    def __init__(self, fastafn, freqfn, colorfn):
        self.pos = []
        self.init = False
        self.size = 0
        self.fasta = Fasta(fastafn)
        self.colorfn = colorfn
        self.conta = {'n': 0, '-': 0, 'a': 1, 'c': 2, 'g': 3, 't': 4, '\n': '\n'}
        self.read_fasta(fastafn)
        self.write_freqs(freqfn)

    def do_plot(self, plot, names=False):
        msa = self.seqtocol(self.colorfn, names=names)
        if plot:
            return msa

    def read_fasta(self, fastafn):
        for entry in self.fasta.keys():
            seq = self.fasta[entry][:]
            if not self.init:
                # This assumes that all the entries in the fasta record are the
                # same size, which is the default output of clustalo.
                # TODO: add an assertion to verify it.
                self.size = len(seq)
                for i in range(0, self.size):
                    self.pos.append(Pos(i))
                self.init = True
            for nt in range(0, self.size):
                self.pos[nt].freq[seq[nt].lower()] += 1

    def seqtocol(self, outfn, names=False):
        outf = open(outfn, 'w')
        colors = []
        for i, entry in enumerate(self.fasta.keys()):
            # (loop variable renamed to `c` so the comprehension cannot shadow
            # the enumerate index `i` used in the reshape below)
            outf.write(entry + ',' + ','.join([str(self.conta[c.lower()]) for c in self.fasta[entry][:]]) + '\n')
            if names:
                colors.append(entry)
            colors.extend(self.conta[c.lower()] for c in self.fasta[entry][:])
        outf.close()
        # TODO: this is very weird, check why one option returns the transpose
        if names:
            #colors = np.array(colors).reshape(1 + i, 1 + len(self.fasta[entry][:]))
            colors = np.array(colors).reshape(1 + len(self.fasta[entry][:]), 1 + i)
        else:
            colors = np.array(colors).reshape(1 + i, len(self.fasta[entry][:]))
        return colors

    def write_freqs(self, outfn):
        outf = open(outfn, 'w')
        outf.write('\t'.join(['a', 'c', 't', 'g']) + '\n')
        for j in self.pos:
            outf.write('\t'.join([str(j.freq['a']), str(j.freq['c']), str(j.freq['t']), str(j.freq['g'])]) + '\n')
        outf.close()
def extract_only_ref_variant_fasta():
    f = Fasta(args.reference)
    if len(f.keys()) == 1:
        ref_id = str(f.keys())
    ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir).readlines()
    core_vcf_file = args.filter2_only_snp_vcf_filename.replace(
        '_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_core.vcf.gz')
    fasta_string = ""
    count = 0
    for lines in ffp:
        lines = lines.strip()
        grep_position = "zcat %s | grep -v \'#\' | awk -F\'\\t\' \'{ if ($2 == %s) print $0 }\' | awk -F\'\\t\' \'{print $5}\'" % (
            core_vcf_file, lines)
        proc = subprocess.Popen([grep_position], stdout=subprocess.PIPE, shell=True)
        (out, err) = proc.communicate()
        out = out.strip()
        if out:
            if "," in out:
                split = out.split(',')
                fasta_string = fasta_string + split[0]
                print "HET SNP found: Position:%s; Taking the First SNP:%s" % (lines, split[0])
                count += 1
            else:
                fasta_string = fasta_string + out
                count += 1
        else:
            fasta_string = fasta_string + str(f.sequence({
                'chr': str(f.keys()[0]),
                'start': int(lines),
                'stop': int(lines)
            }))
            count += 1
    pattern = re.compile(r'\s+')
    fasta_string = re.sub(pattern, '', fasta_string)
    final_fasta_string = ">%s\n" % os.path.basename(
        core_vcf_file.replace('_filter2_final.vcf_core.vcf.gz', '')) + fasta_string
    fp = open("%s/%s_variants.fa" % (args.filter2_only_snp_vcf_dir,
                                     os.path.basename(core_vcf_file.replace('_filter2_final.vcf_core.vcf.gz', ''))),
              'w+')
    fp.write(final_fasta_string + '\n')
    fp.close()
def main():
    args = make_parser()
    if args.inplace:
        f = Fasta(args.fasta_file, flatten_inplace=True)
    else:
        f = Fasta(args.fasta_file)

    if args.output_file is not None:
        output = open(args.output_file, 'w')
    else:
        output_file_name = args.fasta_file.split('.')[0]
        output_file = '{0}.phylip'.format(output_file_name)
        output = open(output_file, 'w')

    sequence_count = len(f.keys())
    sequence_length = len(f[next(iter(f.keys()))])
    # print('', sequence_count, sequence_length, sep=' ')
    output.write(' {0} {1}\n'.format(sequence_count, sequence_length))

    for key in f.keys():
        subseq = []
        for chunk in grouper(f[key][:LINE_LENGTH], CHUNK_LENGTH):
            subseq.append(''.join(item[0] for item in chunk))
        subseq = ' '.join(subseq)
        if len(key) < CHUNK_LENGTH:
            key = key.ljust(CHUNK_LENGTH)
        else:
            key = key[:CHUNK_LENGTH]
        # print(key, ' ', subseq)
        output.write('{0} {1}\n'.format(key, subseq))

    sequence_length -= LINE_LENGTH
    start = LINE_LENGTH
    stop = LINE_LENGTH * 2
    # print()
    output.write('\n')

    while sequence_length > 0:
        for key in f.keys():
            subseq = []
            for chunk in grouper(f[key][start:stop], CHUNK_LENGTH, ' '):
                subseq.append(''.join(item[0] for item in chunk))
            subseq = ' '.join(subseq)
            # print(PAD_STRING, ' ', subseq)
            output.write('{0} {1}\n'.format(PAD_STRING, subseq))
        sequence_length -= LINE_LENGTH
        start += LINE_LENGTH
        stop += LINE_LENGTH
        # print()
        output.write('\n')
    output.close()
def removehost(fasta, bed):
    removeregion = dict()
    with open(bed) as bedin:
        for i in bedin:
            removeregion[i.rstrip()] = 1
    fa = Fasta(fasta)
    outfile = 'removehost_' + fasta
    outio = open(outfile, 'w')
    for seqname in fa.keys():
        if seqname in removeregion:
            continue
        outst = '>' + seqname + '\n' + str(fa[seqname]) + '\n'
        outio.write(outst)
    outio.close()
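# Hedged usage sketch: driving removehost() above. "contigs.fa" and
# "host_ids.txt" are made-up inputs; the second file is simply one excluded
# sequence name per line, which is what the function actually reads.
removehost('contigs.fa', 'host_ids.txt')   # writes removehost_contigs.fa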
def run(self, filename):
    self.openOutFiles(filename)
    f = Fasta(filename)
    count = len(f)
    self.not_found_in_kabat, self.fr4_not_found, current = (0, 0, 0)
    for name in f.keys():
        current += 1
        if current % 1000 == 0:
            print "All %d. Current: %d" % (count, current)
        # format: vName_jName{frameNumber} or vName_dName{frameNumber}_jName{frameNumber}
        vGeneName = name.split("_")[0]
        vGeneRegions = self.getVGeneRegions(vGeneName)
        if vGeneRegions is None:
            continue
        withoutMarkup = f[name][vGeneRegions[self.kabat.regions_count * 2 - 1]:]
        group = self.findFR4(name, withoutMarkup)
        if group is None:
            continue
        self.result_kabat_file.write(name)
        self.result_kabat_file.write(("\t%d" * 10) % tuple(vGeneRegions))
        self.result_kabat_file.write(("\t%d" * 4 + "\n") % tuple(
            [vGeneRegions[9] + i for i in [1, group.start(), group.start() + 1, len(withoutMarkup)]]))
    self.closeOutFiles()
    print "all: {}; not in kabat: {}; without fr4: {}".format(
        current, self.not_found_in_kabat, self.fr4_not_found)
def aa_seq(options):
    """ Gets the ancestral sequence from a Fasta file """
    f = Fasta(options.ancestralfasta)
    keyz = f.keys()
    match = ''
    if options.single_chromosome:
        # A single-chromosome fasta should only have one sequence,
        # and that sequence is the sequence of interest.
        keyz = list(keyz)
        key = keyz[0]
    else:
        get_chromosome_from_header = options.header
        get_chromosome_from_header = \
            get_chromosome_from_header.replace('?', options.chromosome)
        for key in keyz:
            if re.match(get_chromosome_from_header, key) is not None:
                match = key
        if match == '':  # was `match is ''`: identity checks against literals are unreliable
            raise Exception("No match possible: is something wrong with the"
                            " regex specified to the program as"
                            " --header-regex?")
        key = match  # use the matching header, not whichever key the loop ended on
    aaSeq = f[key]
    return aaSeq
def _no_empty(self, lista, listb):
    ''' removes empty entries '''
    # check for empty fasta.
    tmpa = list()
    tmpb = list()
    for i in range(len(listb)):
        try:
            # open it.
            z = Fasta(listb[i], record_class=MemoryRecord)
            # check for empty.
            if len(z.keys()) == 0:
                continue
            # add to temp.
            tmpa.append(lista[i])
            tmpb.append(listb[i])
        except:
            logging.warning("bad fasta file")
    # sort back.
    return tmpa, tmpb
def genome_contenct_stats(fasta_path):
    f = Fasta(fasta_path)
    g_box_total = []
    for seqid in f.keys():
        seq = f[seqid][:]
        g_boxs = len(re.findall('CACGTG', seq, flags=re.IGNORECASE))
        g_box_total.append(g_boxs)
    print >>sys.stderr, "total gboxes:{0}".format(sum(g_box_total))
def create_fasta_flat_file(file):
    """Reads a fasta file for fast sequence retrieval"""
    fasta_file = Fasta(file, key_fn=lambda key: key.split()[0])
    fasta_headers = set(fasta_file.keys())
    return fasta_file, fasta_headers
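# Hedged usage sketch: the key_fn above trims headers to their first
# whitespace-separated token, so lookups can use the short ID.
# "proteins.fa" and "seq1" are made up.
fasta_file, fasta_headers = create_fasta_flat_file("proteins.fa")
if "seq1" in fasta_headers:        # membership tests against the set are O(1)
    print(fasta_file["seq1"][:50])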
def extract_reference_allele():
    print "Extracting Reference Allele from Reference Fasta file - %s to REF\n" % args.reference
    # Get reference genome ID from reference fasta file
    get_reference = Fasta(args.reference)
    if len(get_reference.keys()) == 1:
        ref_id = get_reference.keys()
        print "The reference genome ID from reference genome - %s" % ref_id
    fileObj = open("REF", 'w+')
    fileObj.write('Ref' + '\n')
    for item in pos:
        ref_allele = str(get_reference.sequence({
            'chr': str(get_reference.keys()[0]),
            'start': int(item),
            'stop': int(item)
        }))
        fileObj.write(ref_allele + '\n')
    fileObj.close()
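# Hedged sketch of the pyfasta call used above: Fasta.sequence() takes a
# feature dict; with the default one_based=True a single base has
# start == stop. "ref.fa" and position 42 are made up.
from pyfasta import Fasta

ref = Fasta("ref.fa")
first_chrom = sorted(ref.keys())[0]
base = ref.sequence({'chr': first_chrom, 'start': 42, 'stop': 42})
print(base)   # the 42nd base of the first sequence, 1-based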
def split(args):
    parser = optparse.OptionParser("""\
split a fasta file into separated files.
        pyfasta split -n 6 [-k 5000 ] some.fasta
the output will be some.1.fasta, some.2.fasta ... some.6.fasta
the sizes will be as even as reasonable.
""")
    parser.add_option("--header", dest="header", metavar="FILENAME_FMT",
                      help="""this overrides all other options. if specified, it will
split the file into a separate file for each header. it will be a template
specifying the file name for each new file. e.g.:
    "%(fasta)s.%(seqid)s.fasta"
where 'fasta' is the basename of the input fasta file and seqid is the
header of each entry in the fasta file.""", default=None)
    parser.add_option("-n", "--n", type="int", dest="nsplits",
                      help="number of new files to create")
    parser.add_option("-o", "--overlap", type="int", dest="overlap",
                      help="overlap in basepairs", default=0)
    parser.add_option("-k", "--kmers", type="int", dest="kmers", default=-1,
                      help="""\
split big files into pieces of this size in basepairs. the default
of -1 means do not split the sequence up into k-mers, just
split based on the headers. a reasonable value would be 10Kbp""")
    options, fasta = parser.parse_args(args)
    if not (fasta and (options.nsplits or options.header)):
        sys.exit(parser.print_help())

    if isinstance(fasta, (tuple, list)):
        assert len(fasta) == 1, fasta
        fasta = fasta[0]

    kmer = options.kmers if options.kmers != -1 else None
    overlap = options.overlap if options.overlap != 0 else None
    f = Fasta(fasta)
    if options.header:
        names = dict([(seqid, options.header % \
                       dict(fasta=f.fasta_name, seqid=seqid)) \
                      for seqid in f.keys()])
        """
        if len(names) > 0:
            assert names[0][1] != names[1][1], ("problem with header format", options.header)
        fhs = dict([(seqid, open(fn, 'wb')) for seqid, fn in names[:200]])
        fhs.extend([(seqid, StringIO(), fn) for seqid, fn in names[200:]])
        """
        return with_header_names(f, names)
    else:
        names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap,
                         header=options.header)
        #fhs = [open(n, 'wb') for n in names]
        if options.kmers == -1:
            return without_kmers(f, names)
        else:
            return with_kmers(f, names, options.kmers, options.overlap)
def mask_to_bed(fasta_file, mask_bed_name):
    "creates a bed file of the start and stops of masked seqs"
    mask_bed = open(mask_bed_name, "wb")
    f = Fasta(fasta_file)
    mask_id = 1
    for seqid in f.keys():
        seq = f[seqid][:]
        for m in re.finditer("X+", seq):
            mask_id = mask_id + 1
            w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(
                seqid, m.start(), m.end(), "mask_id {0}".format(mask_id),
                (m.end() - m.start()), (m.end() - m.start() + 1))
            mask_bed.write(w)
def cut_up_genome(input_files_list, output_folder, region_length):
    for file in input_files_list:
        f = Fasta(file)
        chroms = sorted(f.keys())
        for chromosome in chroms:
            sequence = f[chromosome]
            regions = [
                sequence[i:i + region_length]
                for i in range(0, len(sequence), region_length)
            ]
            path = os.path.join(output_folder, f'chr={chromosome}')
            write_to_json(path, regions, region_length)
            print(f'{chromosome} is complete!')
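# Hedged sketch of the write_to_json() helper assumed above -- the original is
# not shown, so this is a guess at its contract: dump the region strings as a
# JSON list under the given path, one directory per chromosome.
import json
import os

def write_to_json(path, regions, region_length):
    os.makedirs(path, exist_ok=True)
    outfn = os.path.join(path, f'regions_{region_length}.json')
    with open(outfn, 'w') as out:
        # pyfasta slices are sequence objects; coerce to plain strings for JSON
        json.dump([str(r) for r in regions], out)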
def process_query():
    print('Reading sequence library and query sequence')
    library = Fasta(library_path)
    queries = Fasta(query_path)
    query_sequence = str(queries["Rattus"])

    print('Processing')
    progress = progressbar.ProgressBar(max_value=len(library.keys()))
    cpu_count = multiprocessing.cpu_count()
    executor = ThreadPoolExecutor(max_workers=cpu_count)
    tasks = []
    for record in list(library.keys())[:library_process_limit]:
        library_sequence = str(library[record])
        future = executor.submit(align, library_sequence, query_sequence)
        tasks.append(AlignmentTask(record, future))

    results = []
    for i in range(len(tasks)):
        _, _, score = tasks[i].future.result()
        results.append(AlignmentResult(title=tasks[i].record, score=score))
        progress.update(i)

    etalone_score = sum([smatrix[(x, x)] for x in query_sequence])
    print("Done")
    print("Etalone score is %d" % etalone_score)
    print("Got %d results, here are top-30 among them:" % len(results))
    print("Score | Match  | Record")
    for sequence in sorted(results, key=lambda x: x.score, reverse=True)[:30]:
        match = (sequence.score / etalone_score) * 100.0
        print("%6d | %5.3f%% | %s" % (sequence.score, match, sequence.title))

    timer = get_performance_timer()
    for time in [timer.dotplot, timer.regions, timer.align]:
        print(time / cpu_count)
def spgenome(fafile, outdir, maxsize=1000000000):
    spfiles = list()
    if path.exists(fafile):
        subfiles = dict()
        infa = Fasta(fafile)
        nowlen = 0
        for chrom in infa.keys():
            chrlen = len(infa[chrom])
            nowlen = nowlen + chrlen
            nowsub = int(nowlen / maxsize)
            if nowsub not in subfiles:
                subfilename = 'tmpfile' + str(nowsub) + '.fa'
                subfile = path.join(outdir, subfilename)
                spfiles.append(subfile)
                subfiles[nowsub] = open(subfile, 'w')
            print('>', chrom, sep='', file=subfiles[nowsub])
            print(infa[chrom], file=subfiles[nowsub])
        for nowsub in subfiles:
            subfiles[nowsub].close()
    else:
        print("Can't find ", fafile)
    return spfiles
class Sequence():
    """docstring for Sequence"""
    def __init__(self, engine='mysql', function='iterator', **kwargs):
        self.engine = engine
        if self.engine == 'mysql' and function == 'iterator':
            self.create_mysql_iterator(**kwargs)
        elif self.engine == 'biopython' and kwargs['data_type'] == 'fasta':
            self.create_biopython_iterator(**kwargs)
        elif self.engine == 'pyfasta' and kwargs['data_type'] == 'fasta':
            self.create_pyfasta_iterator(**kwargs)
        elif self.engine == 'twobit' and kwargs['data_type'] == 'twobit':
            self.create_twobit_iterator(**kwargs)

    def create_mysql_iterator(self, **kwargs):
        cur = kwargs['cursor']
        query = '''SELECT id, record FROM sequence WHERE n_count <= 2 AND trimmed_len > 40'''
        cur.execute(query)
        self.readcount = cur.rowcount
        self.read = iter(cur.fetchall())

    def create_biopython_iterator(self, **kwargs):
        from Bio import SeqIO
        print "Generating BioPython sequence index. This may take a moment...."
        self.fasta = SeqIO.index(kwargs['input'], kwargs['data_type'])
        self.readcount = len(self.fasta)
        self.db_values = zip(range(len(self.fasta)), sorted(self.fasta.keys()))
        self.read = iter(self.db_values)

    def create_twobit_iterator(self, **kwargs):
        import bx.seq.twobit
        self.fasta = bx.seq.twobit.TwoBitFile(file(kwargs['input']))
        self.readcount = self.fasta.seq_count
        self.db_values = zip(range(self.fasta.seq_count), sorted(self.fasta.keys()))
        self.read = iter(self.db_values)

    def create_pyfasta_iterator(self, **kwargs):
        from pyfasta import Fasta
        print "Generating PyFasta sequence index. This may take a moment...."
        self.fasta = Fasta(kwargs['input'])
        self.readcount = len(self.fasta)
        self.db_values = zip(range(len(self.fasta)), sorted(self.fasta.keys()))
        self.read = iter(self.db_values)

    def get_pyfasta_reads(self, **kwargs):
        from pyfasta import Fasta
        self.fasta = Fasta(kwargs['input'])
        self.readcount = len(self.fasta)
def main():
    """ select specific contigs from FASTA file """
    if len(sys.argv) == 2:
        prefix = sys.argv[1]
    else:
        print "Usage: python select.py <prefix>; assume that <prefix>_BspQI_key.txt <prefix>.fasta and <prefix>_list.txt exist; output will be <prefix>_selected.fasta"
        return 0
    ren = ReadTable(prefix + '_BspQI_key.txt', 4, '\t')  # 4 lines of header
    print 'renaming table', ren
    select = ReadTable(prefix + '_list.txt', 0)  # no header; text file of contig numbers, one per line
    print 'select list', select
    # create a dictionary between contig id x[0] and FASTA id x[1]
    renaming = {}
    for x in ren:
        renaming[int(x[0])] = x[1]  # contig names are converted into integers, as well as length
    print 'renaming dictionary', renaming
    # collect the names of the contigs to be selected
    selected_list = []
    for x in select:
        index = int(x[0])  # convert the contig name into an integer so we can match it
        if index in renaming:
            selected_list.append(renaming[index])  # add the name of the contig
        else:
            print 'Error: contig', index, 'does not exist'
            sys.exit(-1)
    print 'selected_list', selected_list
    # open the fasta file for reading
    fas = Fasta(prefix + '.fasta')
    # open the new fasta file for writing
    ofa = open(prefix + '_new.fasta', 'w')
    print 'writing new fasta'
    for x in sorted(fas.keys()):  # process all the contigs one by one
        if x in selected_list:
            print 'Selecting', x
            ofa.write('>' + x + '\n')
            ofa.write(fas[x][:] + '\n')  # entire contig
        else:
            print 'Not selecting', x
    ofa.close()
def generate_corpusfile(fasta_fname, n, corpus_fname):
    '''
    Args:
        fasta_fname: fasta file to read sequences from
        n: the "n" for "n-gram" -- the length of the chunks each sequence is split into
        corpus_fname: output corpus file path
    Description:
        ProtVec uses word2vec internally, and word2vec requires a corpus
        file to train on; this function writes that corpus.
    '''
    f = open(corpus_fname, "w")
    fasta = Fasta(fasta_fname)
    for record_id in tqdm(fasta.keys(), desc='corpus generation progress'):
        r = fasta[record_id]
        seq = str(r)
        ngram_patterns = split_ngrams(seq, n)
        for ngram_pattern in ngram_patterns:
            f.write(" ".join(ngram_pattern) + "\n")
    f.close()
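# Hedged sketch of the split_ngrams() helper assumed above. In the ProtVec
# paper a sequence yields n shifted lists of non-overlapping n-grams; this is
# a minimal reimplementation of that idea, not necessarily the original function.
def split_ngrams(seq, n):
    """'AGAMQSASM', n=3 -> [['AGA','MQS','ASM'], ['GAM','QSA'], ['AMQ','SAS']]"""
    return [[seq[i:i + n] for i in range(offset, len(seq) - n + 1, n)]
            for offset in range(n)]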
def get_sketch(fasta, n_kmers=100, k=15):
    # build a sketch from a sample of k-mers in a fasta file
    # (the original comment said "fastq"; the code reads a fasta)
    hash_count = Counter()
    f = Fasta(fasta)
    for chrom in f.keys():
        seq = f[chrom]
        # `+ 1` so the final k-mer is included (the original range missed it)
        for i in range(len(seq) - k + 1):
            kmer = seq[i:i + k]
            hash_count[kmer] += 1
    hashes_used = 0
    hashed_sketch = []
    for kmer in sorted(hash_count.keys()):
        # `<` rather than `<=` so exactly n_kmers k-mers are kept
        if hashes_used < n_kmers:
            hashed_sketch.append(kmer)
            hashes_used += 1
    return hashed_sketch
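# Hedged follow-up sketch: once two files have sketches, a Jaccard estimate
# over the kept k-mers is the usual way to compare them (MinHash-style,
# though the sketch above keeps the lexicographically smallest k-mers rather
# than the smallest hashes). File names are made up.
def sketch_jaccard(fasta_a, fasta_b, n_kmers=100, k=15):
    a = set(get_sketch(fasta_a, n_kmers, k))
    b = set(get_sketch(fasta_b, n_kmers, k))
    return len(a & b) / float(len(a | b))

# print(sketch_jaccard('sample1.fa', 'sample2.fa'))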
def read_fasta(ref_files, fasta_header):
    """Read fasta files and return the one containing a given header.

    A newline character can only exist between a header and its sequence,
    not inside a sequence.

    Args:
        ref_files (list): Paths to fasta files.
        fasta_header (str): Header to look for.

    Returns:
        Fasta: the pyfasta object for the first file whose keys contain
        fasta_header, or None if no file matches.
    """
    # Open each fasta file and check its headers
    for fasta_path in ref_files:
        fasta = Fasta(fasta_path)
        if fasta_header in fasta.keys():
            return fasta
def split_seqs(self, num_jobs, max_ref=5, max_qry=20):
    ''' splits reference and query into appropriate number of splits '''
    # load data into memory.
    r = Fasta(self.ref_fasta, record_class=MemoryRecord)
    q = Fasta(self.qry_fasta, record_class=MemoryRecord)

    ## reference ##
    # split according to criteria.
    if len(r) < max_ref:
        max_ref = len(r)
    if max_ref > num_jobs:
        max_ref = 1
    if len(q) < max_qry:
        max_qry = len(q)
    if num_jobs < max_qry:
        max_qry = num_jobs
    if (max_ref * max_qry) > num_jobs:
        max_qry = int(float(num_jobs) / float(max_ref))

    # count number of seqs.
    sc = len(r.keys())

    # create split info.
    self.ref_names = ["ref_%i" % x for x in range(max_ref)]
    self.ref_files = ["%s/%s.fasta" % (self.out_dir, x) for x in self.ref_names]

    # split according to rules.
    pyfasta.split_fasta.without_kmers(r, self.ref_files)
    self.ref_names, self.ref_files = self._no_empty(self.ref_names, self.ref_files)

    ## query ##
    # create split info.
    self.qry_names = ["qry_%i" % x for x in range(max_qry)]
    self.qry_files = ["%s/%s.fasta" % (self.out_dir, x) for x in self.qry_names]

    # split according to rules.
    pyfasta.split_fasta.without_kmers(q, self.qry_files)
    self.qry_names, self.qry_files = self._no_empty(self.qry_names, self.qry_files)
def main():
    args = check_options(get_options())
    fain = Fasta(args.input)
    faout = open(args.output, 'w')
    minlen = int(1e6)
    print(minlen)
    shortseq = 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'
    breacker = 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'
    shortlist = list()
    for chrome in fain.keys():
        if len(fain[chrome]) < minlen:
            # print(chrome, len(fain[chrome]))
            # shortseq = shortseq + str(fain[chrome]) + breacker
            shortlist.append(chrome)
        else:
            print(chrome, len(fain[chrome]))
            print('>%s' % chrome, file=faout)
            print(fain[chrome], file=faout)
    print('>shortsequences', file=faout)
    for chrome in shortlist:
        print(str(fain[chrome]), shortseq, sep='', end='', file=faout)
        # print(shortseq, file=faout)
    faout.close()
def main():
    """ select contigs from FASTA file that do not have "reads=1" on their header """
    if len(sys.argv) == 2:
        prefix = sys.argv[1]
    else:
        print "Usage: python get_nonsingleton_unitigs.py <canu_unassembled.fasta>; select contigs from FASTA file that do not have reads=1 on their header; creates <canu_unassembled_unitigs.fasta> file and runs n50 script"
        return 0
    count = 0
    fas = Fasta(prefix)
    ofa = open(prefix[:-6] + '_unitigs.fasta', 'w')
    for x in sorted(fas.keys()):  # process all the contigs one by one
        if "reads=1" in x:
            continue
        #print 'Selecting', x
        ofa.write('>' + x + '\n')
        ofa.write(fas[x][:] + '\n')  # entire contig
        count += 1
    print 'Selected', count, 'contigs with at least 2 reads'
    ofa.close()
    os.system("/home/stelo/bin/n50 -f " + prefix[:-6] + "_unitigs.fasta")
    os.system("rm -f *.flat *.gdx")
def align():
    hg19 = Fasta('hg19.fa')
    print hg19.keys()
    hg19Chr = sorted(hg19.keys(), reverse=True)
    YRI = Fasta('YRIref.fasta')
    print YRI.keys()
    YRIChr = sorted(YRI.keys())
    print hg19[hg19Chr[0]][:20]
    print YRI[YRIChr[0]][:20]

    fhout = open('hg19_YRI_diff.bed', 'w')
    header = 'chrom, chromStart, chromEnd, hg19, YRI \n'
    fhout.write(header)
    for each in hg19Chr:
        seq1 = hg19[each][:10000]
        seq2 = YRI[each][:10000]
        print 'doing alignment for ', each
        alignment = nw.global_align(seq1, seq2, gap=-2, matrix=None, match=1, mismatch=-1)
        len1 = len(alignment[0])  # hg19
        len2 = len(alignment[1])  # YRI
        x = max(len1, len2)
        for i in range(x):
            if alignment[0][i] != alignment[1][i]:
                # write the mismatch to fhout
                outline = each + ',' + str(i) + ',' + str(i + 1) + ',' + alignment[0][i] + ',' + alignment[1][i] + '\n'
                fhout.write(outline)
    fhout.close()
# Usage: python GC_from_fasta file [window_size]
from collections import Counter
import sys

from pyfasta import Fasta

f = Fasta(sys.argv[1], key_fn=lambda key: key.split()[0])
window_size = 301 if len(sys.argv) < 3 else int(sys.argv[2])
# force an odd window so it has a well-defined center
if not (window_size % 2):
    window_size += 1
out = open(sys.argv[1] + '.GC', 'w')
for chrom in f.keys():
    print chrom
    length = len(f[chrom])
    start = 0
    while start < length:
        c = Counter(f[chrom][start:start + window_size])
        try:
            gc = float(c['G'] + c['C'] + c['g'] + c['c'])
            acgt = float(c['G'] + c['C'] + c['g'] + c['c'] + c['t'] + c['T'] + c['a'] + c['A'])
            out.write('\t'.join(map(str, [chrom, start + (window_size - 1) / 2, gc / acgt])) + '\n')
        except ZeroDivisionError:
            pass
        start += window_size
# version 1.1 -- this version is implemented with pyfasta.
import sys, os
from pyfasta import Fasta

if len(sys.argv) != 3:
    print 'Usage: *.py inputFile outputFile'
    sys.exit(0)
inputFile = sys.argv[1]
outputFile = sys.argv[2]

def writeFile(text, files):
    with open(files, 'a') as f:
        f.write(text)

if os.path.isfile(inputFile):
    f = Fasta(inputFile)
    for key in f.keys():
        writeFile(">" + key + os.linesep, outputFile)
        # reverse-complement the whole record; with one_based=False the
        # coordinates are 0-based and 'stop' is exclusive, so use len(f[key])
        # (the original 'len(f[key]) - 1' dropped the last base)
        content = f.sequence({
            'chr': key,
            'start': 0,
            'stop': len(f[key]),
            'strand': '-'
        }, one_based=False)
        writeFile(content + os.linesep, outputFile)
else:
    print 'The input is not a file'
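# Hedged sanity check for the strand='-' call above: pyfasta's reverse
# complement of a whole record should match doing it by hand. "demo.fa" is a
# made-up file; non-ACGTN characters are mapped to 'N' for simplicity.
from pyfasta import Fasta

comp = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
f = Fasta("demo.fa")
key = sorted(f.keys())[0]
fwd = str(f[key][:]).upper()
manual_rc = ''.join(comp.get(c, 'N') for c in reversed(fwd))
rc = f.sequence({'chr': key, 'start': 0, 'stop': len(f[key]), 'strand': '-'},
                one_based=False)
assert str(rc).upper() == manual_rc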
# pfam(key) -> uniprot(value list) dictionary
with open("Metaproteome_pfam_forDMI.tab", "r") as legionella_domains_table:
    legionella_domains_table.readline()
    pfam_uniprot = {}
    for line in legionella_domains_table:
        line = line.strip().split("\t")
        # always split: without this, a comma-free field would be iterated
        # character by character below
        line[1] = line[1].split(",")
        for pfam in line[1]:
            if pfam not in pfam_uniprot:
                pfam_uniprot[pfam] = []
            pfam_uniprot[pfam].append(line[0])

# uniprot(key) -> motif(value list) dictionary
uniprot_motif = {}
for key in human.keys():
    for motif in elm_regex:
        match = re.search(str(elm_regex[motif]), str(human[key]))
        if match:
            if key not in uniprot_motif:
                uniprot_motif[key] = []
            #print("%s;%s;%s" % (motif, match.start(), match.end()))
            uniprot_motif[key].append(
                (motif, str(match.start()), str(match.end())))

with open("MPDMIresult.tsv", "w") as output:
    for pfam, uniprot_list in pfam_uniprot.items():
        for uniprot in uniprot_list:
            for motif in motif_domain:
                if pfam in motif_domain[motif]:
                    for uni, motif_list in uniprot_motif.items():
                        # (the original snippet is truncated here; the inner
                        # loop below follows the same pattern as the dmi()
                        # function later in this collection)
                        for motif_2 in motif_list:
                            if motif_2[0] == motif:
                                output.write(uni + ";" + ";".join(motif_2) + ";" + ";" + pfam + ";" + uniprot + "\n")
def main():
    args = check_options(get_options())
    genomesize = int(os.path.getsize(args.genome) / 1e6)
    kmer = int(log(genomesize, 4) + 1)
    if kmer < 17:
        kmer = 17
    # jellyfish parameters
    lowercount = 2
    jfsize = '100M'
    # split sequences longer than 10M
    spsize = 10000000
    step = args.step
    maxkmerscore = int(((args.length * args.homology / 100) - kmer) * args.ploidy / 2 + 0.5)
    jfpool = Pool(args.threads)

    # rebuild kmer index?
    jfkmerfile = os.path.join(args.saved, (os.path.basename(args.genome) + '_' + str(kmer) + 'mer.jf'))
    kmerbuild = True
    if os.path.isfile(jfkmerfile):
        if not args.docker:
            print("find:", jfkmerfile)
            kmmess = "Found kmerfile " + jfkmerfile + ". Do you want to rebuild it? Press Y or N to continue:"
            print(kmmess)
            while True:
                char = getch()
                if char.lower() in ("y", "n"):
                    print(char)
                    if char == 'y':
                        kmerbuild = True
                    elif char == 'n':
                        kmerbuild = False
                    break

    # rebuild bwa index?
    bwaindexfile = os.path.basename(args.genome)
    bwatestindex = os.path.join(args.saved, bwaindexfile + '.sa')
    bwaindex = os.path.join(args.saved, bwaindexfile)
    bwabuild = True
    if os.path.isfile(bwatestindex):
        if not args.docker:
            print('find:', bwatestindex)
            bwamess = "Found bwa index file " + bwatestindex + ". Do you want to rebuild it? Press Y or N to continue:"
            print(bwamess)
            while True:
                char = getch()
                if char.lower() in ("y", "n"):
                    print(char)
                    if char == 'y':
                        bwabuild = True
                    elif char == 'n':
                        bwabuild = False
                    break

    print("genomesize:", genomesize, "kmer:", kmer, "jfkmerfile:", jfkmerfile,
          "kmerbuild:", kmerbuild, "bwabuild:", bwabuild, "threads:", args.threads)

    # build Jellyfish index
    if kmerbuild:
        jfcount = jellyfish.jfcount(jfpath=args.jellyfish, mer=kmer, infile=args.genome,
                                    output=jfkmerfile, threads=args.threads,
                                    lowercount=lowercount, size=jfsize)
        if jfcount:
            print("JellyFish Count finished ...")
        else:
            print("JellyFish Count Error!!!")
            sys.exit(1)
    else:
        print("Use ", jfkmerfile)
    # end build Jellyfish index

    if bwabuild:
        bwa.bwaindex(args.bwa, args.genome, args.saved)
        print("bwa index build finished ...")
    else:
        print("Use", bwatestindex)

    jffilteredprobe = list()

    if genomesize < 1000:
        fastain = Fasta(args.input)
        jffpbrunerlist = list()
        for seqname in fastain.keys():
            chrlen = len(fastain[seqname])
            if chrlen < spsize:
                start = 0
                end = chrlen - 1
                jffpbruner = jellyfish.JFfpbruner(
                    jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer,
                    pyfasta=fastain, seqname=seqname, pblength=args.length,
                    maxkmerscore=maxkmerscore, start=start, end=end, step=step)
                jffpbrunerlist.append(jffpbruner)
            else:
                chrblock = int(chrlen / spsize) + 1
                for i in range(chrblock):
                    start = i * spsize
                    end = start + spsize - 1
                    if end >= chrlen:
                        end = chrlen - 1
                    jffpbruner = jellyfish.JFfpbruner(
                        jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer,
                        pyfasta=fastain, seqname=seqname, pblength=args.length,
                        maxkmerscore=maxkmerscore, start=start, end=end, step=step)
                    jffpbrunerlist.append(jffpbruner)
        jffinished = 0
        print(len(jffpbrunerlist))
        for curpblist in jfpool.imap_unordered(jellyfish.kmerfilterprobe, jffpbrunerlist):
            jffilteredprobe.extend(curpblist)
            jffinished += 1
            print("Jellyfish filter: ", jffinished, '/', len(jffpbrunerlist), sep='')
        jfpool.close()
        print('Jellyfish filter finished!!')
    else:
        # split the fasta file when the genome size is greater than 1 Gb
        print("genome size > 1G")
        subFas = spgenome.spgenome(args.input, args.saved)
        for subFafile in subFas:
            print(subFafile)
            fastain = Fasta(subFafile)
            jffpbrunerlist = list()
            for seqname in fastain.keys():
                chrlen = len(fastain[seqname])
                if chrlen < spsize:
                    start = 0
                    end = chrlen - 1
                    jffpbruner = jellyfish.JFfpbruner(
                        jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer,
                        pyfasta=fastain, seqname=seqname, pblength=args.length,
                        maxkmerscore=maxkmerscore, start=start, end=end, step=step)
                    jffpbrunerlist.append(jffpbruner)
                else:
                    chrblock = int(chrlen / spsize) + 1
                    for i in range(chrblock):
                        start = i * spsize
                        end = start + spsize - 1
                        if end >= chrlen:
                            end = chrlen - 1
                        jffpbruner = jellyfish.JFfpbruner(
                            jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer,
                            pyfasta=fastain, seqname=seqname, pblength=args.length,
                            maxkmerscore=maxkmerscore, start=start, end=end, step=step)
                        jffpbrunerlist.append(jffpbruner)
            jffinished = 0
            print(len(jffpbrunerlist))
            for curpblist in jfpool.imap_unordered(jellyfish.kmerfilterprobe, jffpbrunerlist):
                jffilteredprobe.extend(curpblist)
                jffinished += 1
                print(subFafile + " Jellyfish filter: ", jffinished, '/', len(jffpbrunerlist), sep='')
        jfpool.close()
        print('Jellyfish filter finished!!')

    tmppbfa = os.path.join(args.saved, os.path.basename(args.input) + '_tmp_probe.fa')
    tmppbfaio = open(tmppbfa, 'w')
    seqnum = 0
    for tmppb in jffilteredprobe:
        print('>', 'seq', seqnum, sep='', file=tmppbfaio)
        print(tmppb, file=tmppbfaio)
        seqnum += 1
    tmppbfaio.close()
    del jffilteredprobe

    bwafiltedpb = bwa.bwafilter(bwabin=args.bwa, reffile=bwaindex, inputfile=tmppbfa,
                                minas=args.length,
                                maxxs=int(args.length * args.homology / 100),
                                threadnumber=args.threads)
    # print(bwafiltedpb)
    tmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input) + '.bed')
    alltmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input) + '_all.bed')
    tmpbwaftlistio = open(tmpbwaftlist, 'w')
    allbwaftlistio = open(alltmpbwaftlist, 'w')

    seqlenfile = os.path.join(args.saved, os.path.basename(args.input) + '.len')
    seqlenio = open(seqlenfile, 'w')
    seqlength = bwa.bwareflength(bwabin=args.bwa, reffile=bwaindex)
    for seqname in seqlength:
        print(seqname, seqlength[seqname], sep='\t', file=seqlenio)
    seqlenio.close()

    oligobefortmf = list()
    for pbtmp in bwafiltedpb:
        nowpbcounter = dict()
        nowpbcounter['seq'] = pbtmp
        nowpbcounter['dTm'] = args.dtm
        nowpbcounter['rprimer'] = args.primer
        oligobefortmf.append(nowpbcounter)

    keepedprobe = list()
    ctedpb = 0
    oligobefortmflen = len(oligobefortmf)
    print("oligobefortmflen:", oligobefortmflen)
    pbftpool = Pool()
    for (pb, keep) in pbftpool.imap_unordered(probefilter, oligobefortmf):
        if keep:
            keepedprobe.append(pb)
        ctedpb += 1
        if ctedpb % 10000 == 0:
            print(ctedpb, '/', oligobefortmflen)
    pbftpool.close()

    pbdictbychr = dict()
    for pb in keepedprobe:
        seq, chro, start = pb.split('\t')
        start = int(start)
        if chro in pbdictbychr:
            pbdictbychr[chro][start] = seq
        else:
            pbdictbychr[chro] = dict()
            pbdictbychr[chro][start] = seq

    lenrprimer = len(args.primer)
    if lenrprimer == 0:
        lenrprimer = 5
    slidwindow = lenrprimer + args.length
    for chro in pbdictbychr:
        startn = 0
        for startnow in sorted(pbdictbychr[chro]):
            endnow = startnow + args.length - 1
            print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=allbwaftlistio, sep='\t')
            if startnow > startn + slidwindow:
                startn = startnow
                print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=tmpbwaftlistio, sep='\t')
    tmpbwaftlistio.close()
    allbwaftlistio.close()
    print("Job finished!!")
def get_aln_size(consensus_ref):
    f = Fasta(consensus_ref)
    assert len(f) == 1
    return len(f[f.keys()[0]])
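# Hedged portability note as code: under Python 3, keys() is typically a view
# and not indexable, so the lookup above would need next(iter(f.keys())).
# A version assumed to work on both:
def get_aln_size_py3(consensus_ref):
    f = Fasta(consensus_ref)
    assert len(f) == 1
    return len(f[next(iter(f.keys()))])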
import sys
from pyfasta import Fasta

# Get the fasta file name from the command line
inputfilename = sys.argv[1]
print 'Input fasta file: ', inputfilename

# Reading the input file
print 'Loading fasta file...'
f = Fasta(inputfilename)

# Getting all keys
KEYS = sorted(f.keys())

# Now we will discard everything that is larger than a certain
# threshold defined by the following variable
Size_threshold = 9000

Maiores = {}
Menores = {}

# We now split them into two dictionaries depending on their size.
# (The original snippet is truncated mid-loop; the body below follows
# directly from the two dictionaries declared above.)
for j in KEYS:
    if len(f[j]) > Size_threshold:
        Maiores[j] = f[j]
    else:
        Menores[j] = f[j]
import sys

import numpy as np
from pyfasta import Fasta

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

try:
    inpFasta = sys.argv[1]
except IndexError:
    print "Arguments: fasta_file"
    sys.exit(1)

# 100 bp window.
window = 100

fa = Fasta(inpFasta)
for seqid in fa.keys():
    # get sequence as a numpy array with dtype='c' -- char
    seq = np.array(fa[seqid], dtype='c')
    gcs = (seq == 'C') | (seq == 'G')
    # cast the booleans to ints.
    gcs = gcs.astype(np.uint8)
    kern = np.ones(window) / window
    # 'same' has boundary effects but the output array is the same length as seq
    # (the original snippet is truncated here; np.convolve is the natural
    # continuation given the kernel built above)
    gc_avg = np.convolve(gcs, kern, mode='same')
# dumb map
m = {"Notch2NL-C_Notch2NL-D": ("Notch2", "Notch2NL-A", "Notch2NL-B"),
     "Notch2NL-D": ("Notch2", "Notch2NL-A", "Notch2NL-B", "Notch2NL-C"),
     "all": ("Notch2", "Notch2NL-A", "Notch2NL-B", "Notch2NL-C", "Notch2NL-D")}
regions = {frozenset(["Notch2NL-D"]): [[0, 15866], [74917, 81068], [162369, 165396]],
           frozenset(["Notch2NL-D", "Notch2NL-C"]): [[15867, 74916]],
           frozenset(): [[81069, 162368], [165397, 2000000]]}

f = Fasta("stitched_alignment.fa")
results = {}
for exclude in [frozenset(), frozenset(["Notch2NL-D"]), frozenset(["Notch2NL-D", "Notch2NL-C"])]:
    t = open("tmp.fasta", "w")
    for para in sorted(set(f.keys()) - exclude):
        t.write(">{}\n{}\n".format(para, f[para]))
    t.close()
    n = '_'.join(sorted(exclude)) if len(exclude) > 0 else 'all'
    cmd = ['java', '-jar', '/cluster/home/ifiddes/jvarkit/dist-1.133/biostar94573.jar',
           '-R', n, 'tmp.fasta']
    r = callProcLines(cmd)
    recs = [x.split() for x in r if not x.startswith("#")]
    results[exclude] = recs

raw_recs = []
for exclude, region in regions.iteritems():
    for start, stop in region:
        raw_recs.extend([x for x in results[exclude] if start < int(x[1]) <= stop])
from glob import glob
from os import getenv

from d2 import d2
from phylum_data import PHYLUM_DATA
from pyfasta import Fasta

K = 25
seq_data = {}
scores = {}
metadata = {}
i = 0
for filename in glob(getenv("DATA_DIR", "data") + "/*.fna"):
    fasta = Fasta(filename)
    key = sorted(fasta.keys())[0]
    genbank_id = key.split(" ")[0]
    short_name = " ".join(key.split(" ")[1:3])
    org_phylum_data = PHYLUM_DATA.get(short_name, {})
    name = " ".join(key.split(" ")[1:-2])[:-1]
    metadata[genbank_id] = {
        "name": name,
        "phylum": org_phylum_data.get("phylum", ""),
        "domain": org_phylum_data.get("domain", ""),
        "ncbiLevel3": org_phylum_data.get("ncbiLevel3", "")
    }
    seq_data[genbank_id] = fasta[key][:]
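# Hedged usage sketch: what a typical .fna header looks like and how the
# slicing above carves it up. The header text is illustrative, not from the
# real data files.
key = "NC_000913.3 Escherichia coli str. K-12 substr. MG1655, complete genome"
print(key.split(" ")[0])               # genbank_id: 'NC_000913.3'
print(" ".join(key.split(" ")[1:3]))   # short_name: 'Escherichia coli'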
def main():
    try:
        ### steps ###
        ## load genome
        ## load mod blat
        ## iterate over blat hits and extract genomic sequences for the given
        ## fragment sizes (transcript and LTR) at the reference position
        ###   2 cases: strand + or -
        ## export bed with sequence coordinates to extract
        ## export fasta output using pybedtools
        ###   seq_id: Qname ; Tname ; LTR size; transcript size ; total size; RC if strand "-"
        ###   seq

        ## load genome
        logger.info("Loading fasta genome ...")
        if stat(args.genome).st_size == 0:
            logger.error("genome file is empty: " + args.genome)
            sys.exit(1)
        else:
            fasta = Fasta(args.genome)
            logger.info("genome file: " + args.genome)
            logger.info("number of reference sequences: " + str(len(sorted(fasta.keys()))))

        ## load mod blat
        logger.info("Loading modblat ...")
        if stat(args.modblat).st_size == 0:
            logger.error("modblat file is empty: " + args.modblat)
            sys.exit(1)
        else:
            logger.info("mod blat file: " + args.modblat)
            mb = ModBlat(args.modblat)
            logger.info("number of blat hits: " + str(len(mb.hits)))
            for hit in mb.hits:
                logger.log(0, "qname/tname pair: " + str(hit.qname) + "/" + str(hit.tname))

        ## compute genomic coordinates
        logger.info("Compute genomic bed items coordinates ...")
        bedItems = []
        for hit in mb.hits:
            bi = hit.computeGenomicSequenceBedItem(args.upstream_frag_sz, args.downstream_frag_sz)
            bedItems.append(bi.totuple())
        logger.info("number of bed items: " + str(len(bedItems)))

        ## export bed items to bed file
        logger.info("Export to bed file ...")
        bed = pybedtools.BedTool(bedItems)
        outfile = path.basename(path.splitext(args.modblat)[0]) + '_seqFlankBlatHit.bed'
        bed.saveas(outfile, trackline="track name='genomic sequence extraction flanking blat hit' color=128,0,0")
        num_lines = sum(1 for line in open(outfile))
        logger.info("number of lines in bed file: " + str(num_lines))

        ## get fasta sequence from bed
        logger.info("Get fasta sequences from bed ...")
        fasta_out = path.basename(path.splitext(args.modblat)[0]) + '_seqFlankBlatHit.fasta'
        bed = bed.sequence(fi=args.genome, s=True, name=True)
        bedout = bed.save_seqs(fasta_out)
        assert open(bedout.seqfn).read() == open(bed.seqfn).read()
        fout = Fasta(fasta_out)
        logger.info("flanking blat hits sequences file: " + fasta_out)
        logger.info("number of flanking sequences: " + str(len(sorted(fout.keys()))))
    except KeyboardInterrupt:
        print "Shutdown requested...exiting"
    except Exception:
        traceback.print_exc(file=sys.stdout)
def run(self):
    if self.kmerbuild:
        jfcounter = jellyfish.jfcount(jfpath=self.jellyfishpath, mer=self.kmer,
                                      infile=self.genomefile, output=self.jfkmerfile,
                                      threads=self.threadsnumber,
                                      lowercount=self.lowercount, size=self.size)
        # check that the jellyfish count ran correctly
        if jfcounter:
            self.progressnumber = self.progressnumber + 5
            self.notifyProgress.emit(self.progressnumber)
            self.notifyMessage.emit("JellyFish Count finished...")
        else:
            self.notifyMessage.emit("JellyFish Count Error!!!")
    else:
        jfcountmess = "Use " + self.jfkmerfile
        self.progressnumber = self.progressnumber + 5
        self.notifyProgress.emit(self.progressnumber)
        self.notifyMessage.emit(jfcountmess)

    if self.indexbuild:
        if self.aligner == 'BWA':
            bwa.bwaindex(self.alnpath, self.genomefile, self.samplefolder)
            self.notifyMessage.emit("BWA Index build finished...")
            self.progressnumber = self.progressnumber + 5
            self.notifyProgress.emit(self.progressnumber)
        elif self.aligner == 'BLAT':
            # TODO: add code for BLAT
            pass
    else:
        self.progressnumber = self.progressnumber + 5
        self.notifyProgress.emit(self.progressnumber)

    # load and split the input file; split sequences longer than 10M
    spsize = 10000000
    maxkmerscore = int(self.pblength * self.homology / 100) - self.kmer
    jffilteredprobe = list()
    fastain = Fasta(self.inputfile)
    jffpbrunerlist = list()
    for seqname in fastain.keys():
        chrlen = len(fastain[seqname])
        if chrlen < spsize:
            start = 0
            end = chrlen - 1
            jffpbruner = jellyfish.JFfpbruner(
                jfpath=self.jellyfishpath, jfkmerfile=self.jfkmerfile, mer=self.kmer,
                pyfasta=fastain, seqname=seqname, pblength=self.pblength,
                maxkmerscore=maxkmerscore, start=start, end=end, step=self.step)
            jffpbrunerlist.append(jffpbruner)
        else:
            chrblock = int(chrlen / spsize) + 1
            for i in range(chrblock):
                start = i * spsize
                end = start + spsize - 1
                if end >= chrlen:
                    end = chrlen - 1
                jffpbruner = jellyfish.JFfpbruner(
                    jfpath=self.jellyfishpath, jfkmerfile=self.jfkmerfile, mer=self.kmer,
                    pyfasta=fastain, seqname=seqname, pblength=self.pblength,
                    maxkmerscore=maxkmerscore, start=start, end=end, step=self.step)
                jffpbrunerlist.append(jffpbruner)

    jffinished = 0
    for curpblist in self.pool.imap_unordered(jellyfish.kmerfilterprobe, jffpbrunerlist):
        jffilteredprobe.extend(curpblist)
        tmpprogress = float(format(self.progressnumber + (jffinished / len(jffpbrunerlist) * 40), ".2f"))
        self.notifyProgress.emit(tmpprogress)
        if self.isRunning():
            print("running")
        else:
            print("not running")
        jffinished += 1
    self.notifyMessage.emit('jelly fish finished!!')
    self.progressnumber = 50.0
    self.notifyProgress.emit(self.progressnumber)

    tmppbfa = os.path.join(self.samplefolder, os.path.basename(self.inputfile) + '_tmp_probes.fa')
    tmppbfaio = open(tmppbfa, 'w')
    seqnum = 0
    for tmppb in jffilteredprobe:
        print('>', 'seq', seqnum, sep='', file=tmppbfaio)
        print(tmppb, file=tmppbfaio)
        seqnum += 1
    tmppbfaio.close()
    # delete jffilteredprobe and release memory
    del jffilteredprobe

    bwaindexfile = os.path.join(self.samplefolder, os.path.basename(self.genomefile))
    bwafiltedpb = bwa.bwafilter(bwabin=self.alnpath, reffile=bwaindexfile,
                                inputfile=tmppbfa, minas=self.pblength,
                                maxxs=int(self.pblength * self.homology / 100),
                                threadnumber=self.threadsnumber)

    tmpbwaftlist = os.path.join(self.samplefolder, os.path.basename(self.inputfile) + '.bed')
    alltmpbwaftlist = os.path.join(self.samplefolder, os.path.basename(self.inputfile) + '_all.bed')
    tmpbwaftlistio = open(tmpbwaftlist, 'w')
    allbwaftlistio = open(alltmpbwaftlist, 'w')
    seqlenfile = os.path.join(self.samplefolder, os.path.basename(self.inputfile)) + '.len'
    seqlenio = open(seqlenfile, 'w')
    seqlength = bwa.bwareflength(bwabin=self.alnpath, reffile=bwaindexfile)
    for seqname in seqlength:
        print(seqname, seqlength[seqname], sep='\t', file=seqlenio)
    seqlenio.close()

    oligobefortmf = list()
    for pbtmp in bwafiltedpb:
        nowpbcounter = dict()
        nowpbcounter['seq'] = pbtmp
        nowpbcounter['dTm'] = self.dTm
        nowpbcounter['rprimer'] = self.rprimer
        oligobefortmf.append(nowpbcounter)

    keepedprobe = list()
    self.progressnumber = 55
    self.notifyProgress.emit(self.progressnumber)
    ctedpb = 0
    oligobefortmflen = len(oligobefortmf)
    for (pb, keep) in self.pool.imap_unordered(probefilter, oligobefortmf):
        if keep:
            keepedprobe.append(pb)
        ctedpb += 1
        if ctedpb % 10000 == 0:
            tmpprogress = float(format(self.progressnumber + (ctedpb / oligobefortmflen * 30), ".2f"))
            self.notifyProgress.emit(tmpprogress)
    self.notifyProgress.emit(90)

    # load probes into a per-chromosome dict
    pbdictbychr = dict()
    for pb in keepedprobe:
        seq, chro, start = pb.split('\t')
        start = int(start)
        if chro in pbdictbychr:
            pbdictbychr[chro][start] = seq
        else:
            pbdictbychr[chro] = dict()
            pbdictbychr[chro][start] = seq

    # get the length of the primer
    lenrprimer = len(self.rprimer)
    if lenrprimer == 0:
        lenrprimer = 5
    slidwindow = lenrprimer + self.pblength
    for chro in pbdictbychr:
        startn = 0
        for startnow in sorted(pbdictbychr[chro]):
            endnow = startnow + self.pblength - 1
            print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=allbwaftlistio, sep='\t')
            if startnow > startn + slidwindow:
                startn = startnow
                print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=tmpbwaftlistio, sep='\t')
    tmpbwaftlistio.close()
    allbwaftlistio.close()
    # remove temp fasta file
    # os.remove(tmppbfa)
    self.notifyProgress.emit(100)
    self.notifyMessage.emit('all finished!!')
def dmi(bacterial_input, bacterial_id_col, bacterial_pf_col, human_receptors_DMI, output_file_path):
    bacterial_id_col = bacterial_id_col - 1
    bacterial_pf_col = bacterial_pf_col - 1

    # def rename(fasta_key):
    #     fasta_key = fasta_key.split("|")
    #     fasta_key = fasta_key[0]
    #     return fasta_key

    # fasta processing -> human.keys() gives the headers, human[key] gives the sequence
    human = Fasta(human_receptors_DMI)  # 'human_receptors.fasta'

    # elm identifier(key) -> regex(value) dictionary
    with open("elm_motif.tsv", "r") as motif_table:
        motif_table.readline()
        elm_regex = {}
        for line in motif_table:
            line = line.strip().split("\t")
            elm_regex[line[1]] = line[4]

    # motif(key) -> domain(value list) dictionary
    with open("elm_interaction_domains.tsv", "r") as motif_domain_table:
        motif_domain_table.readline()
        motif_domain = {}
        for line in motif_domain_table:
            line = line.strip("\n").split("\t")
            if line[0] not in motif_domain:
                motif_domain[line[0]] = []
            motif_domain[line[0]].append(line[1])

    # pfam(key) -> uniprot(value list) dictionary
    with open(bacterial_input, "r") as bacterial_proteins:
        bacterial_proteins.readline()
        bacterial_proteins = [a.strip().split("\t") for a in bacterial_proteins]
        pfam_uniprot = dict([(a[bacterial_pf_col], []) for a in bacterial_proteins])
        for line in bacterial_proteins:
            pfam_uniprot[line[bacterial_pf_col]].append(line[bacterial_id_col])

    # uniprot(key) -> motif(value list) dictionary
    uniprot_motif = {}
    for key in human.keys():
        for motif in elm_regex:
            match = re.search(str(elm_regex[motif]), str(human[key]))
            if match:
                if key not in uniprot_motif:
                    uniprot_motif[key] = []
                #print("%s;%s;%s" % (motif, match.start(), match.end()))
                uniprot_motif[key].append(
                    (motif, str(match.start()), str(match.end())))

    with open(output_file_path, "w") as output:
        predictions = 0
        for pfam, uniprot_list in pfam_uniprot.items():
            for uniprot in uniprot_list:
                for motif in motif_domain:
                    if pfam in motif_domain[motif]:
                        for uni, motif_list in uniprot_motif.items():
                            for motif_2 in motif_list:
                                if motif_2[0] == motif:
                                    predictions += 1
                                    output.write(uni + ";" + ";".join(motif_2) + ";" + ";" + pfam + ";" + uniprot + "\n")
    return predictions
    sys.exit(0)

# process arguments
genome1 = arg[0]
genome2 = arg[1]
genomeOut = arg[2]

# open genomeOut file to write new genome
gOut = open(genomeOut, 'w')

# open both genomes as pyFasta arrays
Fgenome1 = Fasta(genome1)
Fgenome2 = Fasta(genome2)

# get chromosome names
chroms = Fgenome1.keys()

# for each chromosome
for chrom in chroms:
    # convert pyFasta arrays to numpy arrays
    np_genome1 = np.array(Fgenome1[chrom])
    np_genome2 = np.array(Fgenome2[chrom])
    # get Boolean array from elementwise comparison of chromosomes
    chrom_matches = np.core.defchararray.equal(np_genome1, np_genome2)
    # make a new array of the size of the chromosome, filled with N's
# Import MAC fasta file
logComment('Importing MAC fasta file...')
mac_fasta = None
try:
    mac_fasta = Fasta(MACfile)
except Exception as e:
    print("Error while importing fasta file\n" + str(e))
    logComment("Can't import fasta file\n" + str(e))
    exit()
#if DEBUGGING:
#    print(list(mac_fasta.keys()))

# Record number of imported MAC contigs
macCount = len(mac_fasta.keys())
logComment(str(macCount) + ' sequences imported')

# Rough Blast parameters
dust = "yes" if Options['RoughBlastDust'] else "no"
ungapped = " -ungapped " if Options['RoughBlastUngapped'] else ""
maskLowercase = " -lcase_masking " if Options['BlastMaskLowercase'] else ""
logComment("BLAST rough pass parameters:\nblastn -task " + Options['RoughBlastTask'] +
           " -word_size " + str(Options['RoughBlastWordSize']) + " -max_hsps 0 " +
           "-max_target_seqs 10000 -dust " + dust + ungapped + maskLowercase +
           "-num_threads " + str(Options['ThreadCount']) +
           " -outfmt \"10 qseqid sseqid pident length mismatch qstart qend sstart send evalue bitscore qcovs\"\n")

# Fine Blast parameters
dust = "yes" if Options['FineBlastDust'] else "no"
ungapped = " -ungapped " if Options['FineBlastUngapped'] else ""
def main():
    # Functions for jtools:
    #   Get sequence
    #   Detect tandem acceptors NAGNAG
    #   Annotate with genes
    #   Jiggle
    #   Bed to juncid
    #   Guess frame
    #   Find stops in intron + in frame
    #   SVM recomputes
    #   Splice site strength? ppt?

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Load a fasta file
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    global f
    # Opening fasta filehandle
    print >> sys.stderr, "[%s] Opening fasta file" % (spanki_utils.timestamp())
    f = Fasta(fastafile)
    fastachr = set(sorted(f.keys()))
    #print fastachr

    ########################################################
    ### Parsing a juncbed file
    ########################################################
    if (juncbedfile):
        print "juncid\toriginal_id\tdastring"
        print >> sys.stderr, "Loading", juncbedfile
        lines = csv.reader(open(juncbedfile, 'rb'), delimiter='\t')
        z = []
        for line in lines:
            pattern = re.compile('track')
            track = pattern.search(line[0])
            if not track:
                values = line
                blocksizes = values[10].split(",")
                blockstarts = values[11].split(",")
                chr = values[0]
                rangestart = int(values[1]) - 1
                rangeend = int(values[2])
                strand = values[5]
                id = values[3]
                intronstart = rangestart + int(blocksizes[0]) + 2
                intronend = rangeend - int(blocksizes[1])
                # Or..
                #intronend = rangestart + int(blocksizes[0]) + int(blockstarts[1])
                #chrXHet 800 1767 JUNC00000001 2 + 800 1767 255,0,0 2 20,63 0,904
                intronsize = intronend - intronstart
                juncid = chr + ":" + str(intronstart) + "_" + str(intronend) + ":" + strand
                dastring = intron_sequence_single(juncid, f)
                z.append(str(dastring))
                print juncid, values[3], dastring
        print >> sys.stderr, "Distribution of detected motifs:\n", Counter(z)
        quit("Done")

    ########################################################
    ### Parsing an intronbed file
    ########################################################
    #scaffold_12916 13833982 13834044 10
    #scaffold_12916 13838614 13838676 67
    #scaffold_12916 13839119 13839204 75
    if (intronbedfile):
        print "juncid\tid\tdastring"
        lines = csv.reader(open(intronbedfile, 'rb'), delimiter='\t')
        for line in lines:
            pattern = re.compile('track')
            track = pattern.search(line[0])
            values = line
            if not track:
                chr = values[0]
                intronstart = int(values[1]) + 1
                intronend = int(values[2]) - 1
                strand = "+"
                id = values[0]
                intronsize = intronend - intronstart
                juncid = chr + ":" + str(intronstart) + "_" + str(intronend) + ":" + strand
                dastring = intron_sequence_single(juncid, f)
                print juncid, values[3], dastring
        quit("Done")

    ########################################################
    ### Converting from another format
    ########################################################
    if gfffile:
        #reflist = tab_to_dict(gff)
        results = collections.defaultdict(lambda: collections.defaultdict(dict))
        gffdict = gff_to_dict(gfffile)
        for x in gffdict:
            if (gffdict[x]['feature_type'] == "exon_junction"):
                juncid = gffdict[x]['chr'] + ":" + str(int(gffdict[x]['start']) + 1) + "_" + str(int(gffdict[x]['end']) - 1) + ":" + gffdict[x]['strand']
            elif (gffdict[x]['feature_type'] == "intron"):
                juncid = gffdict[x]['chr'] + ":" + gffdict[x]['start'] + "_" + gffdict[x]['end'] + ":" + gffdict[x]['strand']
            dastring = intron_sequence_single(juncid, f)
            results[x]['juncid'] = juncid
            results[x]['dastring'] = dastring
        print "ID\tjuncid\tdastring"
        for x in sorted(results.iterkeys()):
            print x, "\t", results[x]['juncid'], "\t", results[x]['dastring']
        quit()

    ########################################################
    ### Converting from another format
    ########################################################
    if gtffile:
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Initializing the reference
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # You need the gtf file and the fasta file
        lookup = spanki_utils.prep_ref(gtffile, fastafile, output_dir)
        ## Note that you now have a reference called ref.bam, and a lookup dict
        #tmp_dir = output_dir + "/tmp/"
        #reffile = tmp_dir + "/ref.bam"
        reffile = "tmp/ref.bam"
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Load an annotation, flattened as bam
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        print >> sys.stderr, "[%s] Trying to load annotation as bam" % (spanki_utils.timestamp())
        reffh = pysam.Samfile(reffile, "rb")
        edgedict, refjuncs = spanki_parse_utils.parseRefAsBam(reffh)
        reffh.close()
        print >> sys.stderr, "[%s] Done loading annotation as bam" % (spanki_utils.timestamp())
        for junc in refjuncs:
            print junc
        quit()

    ########################################################
    ### Below are functions that operate on a junction list
    ########################################################
    if jlist:
        #~~~~~~~~~~~~~~~~~~~
        # Load reference junction list
        #~~~~~~~~~~~~~~~~~~~
        reflist = tab_to_dict(jlist)
        # Find the junctions in jlist that are not in jtab
        myjuncs = reflist.keys()
        print >> sys.stderr, len(myjuncs), "in junction list"
        updonor = 20
        downdonor = 2
        upacceptor = 2
        downacceptor = 20
        for x in myjuncs:
            print x
            j1 = Junctionid(x)
            j1.display()
            if j1.strand == "+":
                tempseq = Seq(f[j1.chr][j1.donor - updonor:j1.donor], IUPAC.unambiguous_dna)
                #print "***", tempseq.translate()
                #print Seq(f[j1.chr][j1.donor:j1.donor + downdonor], IUPAC.unambiguous_dna)
                #print Seq(f[j1.chr][j1.acceptor - upacceptor:j1.acceptor], IUPAC.unambiguous_dna)
                nagstring = find_nag(Seq(f[j1.chr][j1.acceptor:j1.acceptor + downacceptor], IUPAC.unambiguous_dna))
                print nagstring
            elif j1.strand == "-":
                pass
                #print Seq(f[j1.chr][j1.donor:j1.donor + updonor], IUPAC.unambiguous_dna).reverse_complement()
                #print Seq(f[j1.chr][j1.donor - downdonor:j1.donor], IUPAC.unambiguous_dna).reverse_complement()
                #print Seq(f[j1.chr][j1.acceptor:j1.acceptor + upacceptor], IUPAC.unambiguous_dna).reverse_complement()
                #print Seq(f[j1.chr][j1.acceptor - downacceptor:j1.acceptor], IUPAC.unambiguous_dna).reverse_complement()
            else:
                quit("Don't recognize strand")
        quit("Done")

    quit()

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Older code that's not used yet
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # IRT
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    bamfh = pysam.Samfile(bamfile, "rb")
    # Need some kind of iterator to get the read length from the first alignment in the sam
    print >> sys.stderr, "[%s] Getting intron read-though (IRT), may take awhile" % (spanki_utils.timestamp())
    IRT = intron_readthrough(myjuncs, bamfh)
    bamfh.close()
    print >> sys.stderr, "[%s] Done getting IRT" % (spanki_utils.timestamp())
    #for edgeid in covbyedge.keys():
    #    print edgeid, covbyedge[edgeid]
    # These are the fields you end up with after merging:
    #juncid geneassign cov lirt rirt irt dncov ancov numsamps
    #chr2L:22427471_22427525:- none 2 57 28 85 0 0 1
    #chr2R:5702257_5702656:+ FBgn0040092 13 0 0 0 0 0 2
    #chr2L:11436293_11436415:- FBgn0261648 23 0 0 0 0 0 2
    #chr2R:9334834_9336812:- FBgn0013765 6 0 0 0 0 0 2

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Now compile the results
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    print >> sys.stderr, "Printing results table"
    print >> juncs_out, "juncid\tgeneassign\tannostatus\tintron_size\tgmcode\tregcode\tcov\tlirt\trirt\tirt\tdncov\tancov"
    for juncid in sorted(keys2):
        try:
            results = [juncid, jdict[juncid]['geneassign'], jdict[juncid]['annostatus'],
                       jdict[juncid]['intron_size'], jdict[juncid]['gmcode'], jdict[juncid]['regcode'],
                       jdict[juncid]['cov'], jdict[juncid]['lirt'], jdict[juncid]['rirt'],
                       jdict[juncid]['irt'], jdict[juncid]['dncov'], jdict[juncid]['ancov']]
            print >> juncs_out, ('\t'.join(map(str, results)))
        except KeyError:
            #myjuncs.append(juncid)
            j1 = Junctionid(juncid)
            donid = j1.donid
            accid = j1.accid
            dncov = covbyedge[donid] if covbyedge[donid] else 0
            ancov = covbyedge[accid] if covbyedge[accid] else 0
            results = [juncid, reflist[juncid]['geneassign'], reflist[juncid]['annostatus'],
                       reflist[juncid]['intron_size'], reflist[juncid]['gmcode'], reflist[juncid]['regcode'],
                       0, IRT[juncid]['lirt'], IRT[juncid]['rirt'], IRT[juncid]['irt'], dncov, ancov]
            print >> juncs_out, ('\t'.join(map(str, results)))
    quit("done")

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Parse the read alignments
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Parse the bam file to get a table of junctions, a table of donors, etc.
    bamfh = pysam.Samfile(bamfile, "rb")
    #JTAB, UNFILT_JTAB, STAB, NEWDTAB, MMES = parse_aligns_detailed(bamfh)
    JTAB, UNFILT_JTAB = quickcov(bamfh, anchorsize)
    bamfh.close()
    myjuncs = JTAB.keys()
    myjuncs.sort()

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Print junction list to the output directory
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    print "juncid\tunfilt_cov\tcov"
    for juncid in myjuncs:
        print juncid, UNFILT_JTAB[juncid], JTAB[juncid]
def main(): args = check_options(get_options()) genomesize = int(os.path.getsize(args.genome)/1e6) kmer = int(log(genomesize, 4)+1) if kmer < 17: kmer = 17 # jellyfish parameter lowercount = 2 # jellyfish parameter jfsize = '100M' # split sequences longer than 10 Mb spsize = 10000000 step = args.step maxkmerscore = int(args.length * args.homology / 100) - kmer jfpool = Pool(args.threads) # build the k-mer index? jfkmerfile = os.path.join(args.saved,(os.path.basename(args.genome)+'_'+str(kmer)+'mer.jf')) kmerbuild = True if os.path.isfile(jfkmerfile): if not args.docker: print("find:", jfkmerfile) kmmess = "Found kmerfile "+jfkmerfile+". Do you want to rebuild it? Press Y or N to continue:" print(kmmess) while True: char = getch() if char.lower() in ("y", "n"): print(char) if char.lower() == 'y': kmerbuild = True elif char.lower() == 'n': kmerbuild = False break # build the bwa index? bwaindexfile = os.path.basename(args.genome) bwatestindex = os.path.join(args.saved, bwaindexfile+'.sa') bwaindex = os.path.join(args.saved, bwaindexfile) bwabuild = True if os.path.isfile(bwatestindex): if not args.docker: print('find:', bwatestindex) bwamess = "Found bwa index file " + bwatestindex + ". Do you want to rebuild it? Press Y or N to continue:" print(bwamess) while True: char = getch() if char.lower() in ("y", "n"): print(char) if char.lower() == 'y': bwabuild = True elif char.lower() == 'n': bwabuild = False break print("genomesize:",genomesize, "kmer:",kmer, "jfkmerfile:", jfkmerfile, "kmerbuild:", kmerbuild, "bwabuild:", bwabuild, "threads:", args.threads) # Build Jellyfish index if kmerbuild: jfcount = jellyfish.jfcount(jfpath=args.jellyfish, mer=kmer, infile=args.genome, output=jfkmerfile, threads=args.threads, lowercount=lowercount, size=jfsize) if jfcount: print("JellyFish Count finished ...") else: print("JellyFish Count Error!!!") sys.exit(1) else: print("Use ", jfkmerfile) # End build Jellyfish index if bwabuild: bwa.bwaindex(args.bwa, args.genome, args.saved) print("bwa index build finished ...") else: print("Use", bwatestindex) jffilteredprobe = list() fastain = Fasta(args.input) jffpbrunerlist = list() for seqname in fastain.keys(): chrlen = len(fastain[seqname]) if chrlen < spsize: start = 0 end = chrlen - 1 jffpbruner = jellyfish.JFfpbruner(jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer, pyfasta=fastain, seqname=seqname, pblength=args.length, maxkmerscore=maxkmerscore, start=start, end=end, step=step) jffpbrunerlist.append(jffpbruner) else: chrblock = int(chrlen/spsize) + 1 for i in range(chrblock): start = i * spsize end = start + spsize - 1 if end >= chrlen: end = chrlen - 1 jffpbruner = jellyfish.JFfpbruner(jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer, pyfasta=fastain, seqname=seqname, pblength=args.length, maxkmerscore=maxkmerscore, start=start, end=end, step=step) jffpbrunerlist.append(jffpbruner) jffinished = 0 for curpblist in jfpool.imap_unordered(jellyfish.kmerfilterprobe, jffpbrunerlist): jffilteredprobe.extend(curpblist) jffinished += 1 print("Jellyfish filter: ",jffinished,'/',len(jffpbrunerlist), sep='') jfpool.close() print('Jellyfish filter finished!!') tmppbfa = os.path.join(args.saved, os.path.basename(args.input)+'_tmp_probe.fa') tmppbfaio = open(tmppbfa, 'w') seqnum = 0 for tmppb in jffilteredprobe: print('>','seq',seqnum, sep='',file=tmppbfaio) print(tmppb,file=tmppbfaio) seqnum += 1 tmppbfaio.close() del jffilteredprobe bwafiltedpb = bwa.bwafilter(bwabin=args.bwa, reffile=bwaindex, inputfile=tmppbfa, minas=args.length, maxxs=int(args.length*args.homology/100), threadnumber=args.threads) # print(bwafiltedpb)
tmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input)+'.bed') alltmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input)+'_all.bed') tmpbwaftlistio = open(tmpbwaftlist,'w') allbwaftlistio = open(alltmpbwaftlist,'w') seqlenfile = os.path.join(args.saved, os.path.basename(args.input)+'.len') seqlenio = open(seqlenfile,'w') seqlength = bwa.bwareflength(bwabin=args.bwa, reffile=bwaindex) for seqname in seqlength: print(seqname, seqlength[seqname], sep='\t', file=seqlenio) seqlenio.close() oligobefortmf = list() for pbtmp in bwafiltedpb: # print(pbtmp, file=tmpbwaftlistio) nowpbcounter = dict() nowpbcounter['seq'] = pbtmp nowpbcounter['dTm'] = args.dtm nowpbcounter['rprimer'] = args.primer oligobefortmf.append(nowpbcounter) keepedprobe = list() ctedpb = 0 oligobefortmflen = len(oligobefortmf) print("oligobefortmflen:",oligobefortmflen) pbftpool = Pool() for (pb, keep) in pbftpool.imap_unordered(probefilter, oligobefortmf): if keep: keepedprobe.append(pb) # print(pb, file=tmpbwaftlistio) ctedpb += 1 if ctedpb % 10000 == 0: print(ctedpb,'/',oligobefortmflen) pbdictbychr = dict() pbftpool.close() for pb in keepedprobe: seq, chro, start = pb.split('\t') start = int(start) if chro in pbdictbychr: pbdictbychr[chro][start] = seq else: pbdictbychr[chro] = dict() pbdictbychr[chro][start] = seq lenrprimer = len(args.primer) if lenrprimer == 0: lenrprimer = 5 slidwindow = lenrprimer+args.length for chro in pbdictbychr: startn = 0 for startnow in sorted(pbdictbychr[chro]): endnow = startnow + args.length - 1 print(chro, startnow, endnow, pbdictbychr[chro][startnow],file=allbwaftlistio,sep='\t') if startnow > startn+slidwindow: #startn = startnow+slidwindow startn = startnow print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=tmpbwaftlistio, sep='\t') tmpbwaftlistio.close() allbwaftlistio.close() print("Job finished!!")
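Two pieces of logic in this pipeline recur in the GUI variant below and are worth seeing in isolation: cutting a long chromosome into 10 Mb blocks, and the sliding-window thinning that keeps selected probes from crowding each other. A self-contained sketch with the same behavior (helper names are mine, not the script's):

def chunk_ranges(chrlen, spsize=10000000):
    """Yield inclusive (start, end) blocks covering [0, chrlen)."""
    for start in range(0, chrlen, spsize):
        yield start, min(start + spsize, chrlen) - 1

def thin_probes(starts, window):
    """Greedy thinning, mirroring the slidwindow loop above: keep a probe
    only if it starts more than `window` bases after the last kept one."""
    kept = []
    startn = 0
    for startnow in sorted(starts):
        if startnow > startn + window:
            startn = startnow
            kept.append(startnow)
    return kept

assert list(chunk_ranges(25, 10)) == [(0, 9), (10, 19), (20, 24)]
assert thin_probes([5, 40, 50, 120, 130], window=60) == [120]

Note that, like the original loop, thin_probes also drops candidates that start within `window` bases of position 0.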
def acquire_chr(ref_genome): fasta = Fasta(ref_genome) # avoid shadowing the builtin `file` return sorted(fasta.keys())
def random_sequence(path): fasta = Fasta(path) key = choice(list(fasta.keys())) # list() so random.choice also works on Python 3 views return (key, fasta[key])
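A quick smoke test tying the two helpers together (the reference file name is hypothetical):

# Hypothetical usage of the two helpers above.
chroms = acquire_chr("reference.fa")        # sorted record names
name, seq = random_sequence("reference.fa")
print(name, len(seq), chroms[0])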
def run(self): if self.kmerbuild: jfcounter = jellyfish.jfcount(jfpath=self.jellyfishpath, mer=self.kmer, infile=self.genomefile, output=self.jfkmerfile, threads=self.threadsnumber, lowercount=self.lowercount, size=self.size) """ check that the jellyfish count ran correctly """ if jfcounter: self.progressnumber = self.progressnumber + 5 self.notifyProgress.emit(self.progressnumber) self.notifyMessage.emit("JellyFish Count finished...") else: self.notifyMessage.emit("JellyFish Count Error!!!") else: jfcountmess = "Use " + self.jfkmerfile self.progressnumber = self.progressnumber + 5 self.notifyProgress.emit(self.progressnumber) self.notifyMessage.emit(jfcountmess) if self.indexbuild: if self.aligner == 'BWA': bwa.bwaindex(self.alnpath, self.genomefile, self.samplefolder) self.notifyMessage.emit("BWA Index build finished...") self.progressnumber = self.progressnumber + 5 self.notifyProgress.emit(self.progressnumber) elif self.aligner == 'BLAT': """ add code for BLAT """ pass else: self.progressnumber = self.progressnumber + 5 self.notifyProgress.emit(self.progressnumber) """ load and split the input file """ # split sequences longer than 10 Mb spsize = 10000000 maxkmerscore = int(self.pblength * self.homology / 100) - self.kmer jffilteredprobe = list() fastain = Fasta(self.inputfile) jffpbrunerlist = list() for seqname in fastain.keys(): chrlen = len(fastain[seqname]) if chrlen < spsize: start = 0 end = chrlen - 1 jffpbruner = jellyfish.JFfpbruner(jfpath=self.jellyfishpath, jfkmerfile=self.jfkmerfile, mer=self.kmer, pyfasta=fastain, seqname=seqname, pblength=self.pblength, maxkmerscore=maxkmerscore, start=start, end=end, step=self.step) jffpbrunerlist.append(jffpbruner) else: chrblock = int(chrlen / spsize) + 1 for i in range(chrblock): start = i * spsize end = start + spsize - 1 if end >= chrlen: end = chrlen - 1 jffpbruner = jellyfish.JFfpbruner( jfpath=self.jellyfishpath, jfkmerfile=self.jfkmerfile, mer=self.kmer, pyfasta=fastain, seqname=seqname, pblength=self.pblength, maxkmerscore=maxkmerscore, start=start, end=end, step=self.step) jffpbrunerlist.append(jffpbruner) jffinished = 0 for curpblist in self.pool.imap_unordered(jellyfish.kmerfilterprobe, jffpbrunerlist): jffilteredprobe.extend(curpblist) jffinished += 1 # count this batch before reporting progress tmpprogress = float( format( self.progressnumber + (jffinished / len(jffpbrunerlist) * 40), ".2f")) self.notifyProgress.emit(tmpprogress) if self.isRunning(): print("running") else: print("not running") self.notifyMessage.emit('kmer filter finished!!') self.progressnumber = 50.0 self.notifyProgress.emit(self.progressnumber) tmppbfa = os.path.join( self.samplefolder, os.path.basename(self.inputfile) + '_tmp_probes.fa') tmppbfaio = open(tmppbfa, 'w') seqnum = 0 for tmppb in jffilteredprobe: print('>', 'seq', seqnum, sep='', file=tmppbfaio) print(tmppb, file=tmppbfaio) seqnum += 1 tmppbfaio.close() # delete jffilteredprobe to release memory del jffilteredprobe bwaindexfile = os.path.join(self.samplefolder, os.path.basename(self.genomefile)) bwafiltedpb = bwa.bwafilter(bwabin=self.alnpath, reffile=bwaindexfile, inputfile=tmppbfa, minas=self.pblength, maxxs=int(self.pblength * self.homology / 100), threadnumber=self.threadsnumber) tmpbwaftlist = os.path.join(self.samplefolder, os.path.basename(self.inputfile) + '.bed') alltmpbwaftlist = os.path.join( self.samplefolder, os.path.basename(self.inputfile) + '_all.bed') tmpbwaftlistio = open(tmpbwaftlist, 'w') allbwaftlistio = open(alltmpbwaftlist, 'w') seqlenfile = os.path.join(self.samplefolder, os.path.basename(self.inputfile)) + '.len' seqlenio = open(seqlenfile, 'w')
seqlength = bwa.bwareflength(bwabin=self.alnpath, reffile=bwaindexfile) for seqname in seqlength: print(seqname, seqlength[seqname], sep='\t', file=seqlenio) seqlenio.close() oligobefortmf = list() for pbtmp in bwafiltedpb: # print(pbtmp, file=tmpbwaftlistio) nowpbcounter = dict() nowpbcounter['seq'] = pbtmp nowpbcounter['dTm'] = self.dTm nowpbcounter['rprimer'] = self.rprimer oligobefortmf.append(nowpbcounter) keepedprobe = list() self.progressnumber = 55 self.notifyProgress.emit(self.progressnumber) ctedpb = 0 oligobefortmflen = len(oligobefortmf) for (pb, keep) in self.pool.imap_unordered(probefilter, oligobefortmf): if keep: keepedprobe.append(pb) # print(pb, file=tmpbwaftlistio) ctedpb += 1 if ctedpb % 10000 == 0: tmpprogress = float( format( self.progressnumber + (ctedpb / oligobefortmflen * 30), ".2f")) self.notifyProgress.emit(tmpprogress) self.notifyProgress.emit(90) pbdictbychr = dict() # load probes into a dict keyed by chromosome for pb in keepedprobe: # print(pb, file=tmpbwaftlistio) seq, chro, start = pb.split('\t') start = int(start) if chro in pbdictbychr: pbdictbychr[chro][start] = seq else: pbdictbychr[chro] = dict() pbdictbychr[chro][start] = seq # get the length of the reverse primer lenrprimer = len(self.rprimer) if lenrprimer == 0: lenrprimer = 5 slidwindow = lenrprimer + self.pblength for chro in pbdictbychr: startn = 0 for startnow in sorted(pbdictbychr[chro]): endnow = startnow + self.pblength - 1 print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=allbwaftlistio, sep='\t') if startnow > startn + slidwindow: #startn = startnow+slidwindow startn = startnow print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=tmpbwaftlistio, sep='\t') tmpbwaftlistio.close() allbwaftlistio.close() # remove the temporary fasta file # os.remove(tmppbfa) self.notifyProgress.emit(100) self.notifyMessage.emit('all finished!!')
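Both pipelines delegate the Tm/primer check to probefilter via Pool.imap_unordered, but its body is not part of this excerpt. The visible contract is narrow: it takes a dict with 'seq', 'dTm', and 'rprimer', and returns a (record, keep) tuple, where the record is a sequence<TAB>chrom<TAB>start string (implied by the later pb.split('\t')). A crude illustrative stand-in that honors that contract, using a GC-based melting-temperature estimate in place of whatever the real filter does:

def probefilter_sketch(job):
    """Illustrative only, not the real probefilter; the 75 C target Tm
    is an assumption."""
    record = job['seq']                          # "sequence\tchrom\tstart"
    seq = record.split('\t')[0].upper()
    gc = (seq.count('G') + seq.count('C')) / float(len(seq))
    tm = 64.9 + 41.0 * (gc - 16.4 / len(seq))    # long-oligo GC estimate
    keep = abs(tm - 75.0) <= float(job['dTm'])
    return record, keep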
def vcf_to_fasta(input_vcf, output_fasta, ref_seq, species, use_indels, min_depth, free_bayes, ploidy, to_fasta, main_sequence, coverage_files, min_probs=0.8, impute=False, unique_only=False): # First part is to get the fasta sequence, then take each position # and alter the reference as necessary for each sample, # because everyone will have different SNPs. f = Fasta(ref_seq) # For now this is only going to work with mtDNA sequences, # but plan to extend this in the future to full genome # gets the full genome sequence and currently assumes # that the fasta only contains one sequence. min_depth = int(min_depth) ploidy = int(ploidy) if impute: is_beagle = True index = [n for n, l in enumerate(f.keys()) if l.startswith(main_sequence)] index = index[0] full_sequence = list(str(f[f.keys()[index]])) min_max_coord = [] first_coordinate = True sample_fasta = {} unique_snps = {} if free_bayes or ploidy == 1: free_bayes = True ploidy = 1 if impute: is_beagle = True free_bayes = False sample_lines = {} vcf_reader = vcf.Reader(open(input_vcf, 'r'), strict_whitespace=True) samples = vcf_reader.samples sample_offset = {} sample_offset_end = {} for sample in samples: sample_lines[sample] = [] sample_fasta[sample] = full_sequence[:] sample_offset[sample] = 0 sample_offset_end[sample] = {} for record in vcf_reader: position = record.POS if first_coordinate: min_max_coord.append(str(position)) first_coordinate = False for sample in record.samples: genotype = sample['GT'] is_beagle = False temp_position = position - 1 + sample_offset[sample.sample] try: pl = sample['PL'] pheno_l = [int(o) for o in pl] dp = sample['DP'] pl = pheno_l.index(min(pheno_l)) if genotype is None or float(dp) <= min_depth: sample_fasta[sample.sample][temp_position] = 'N' # Guard so nothing below overwrites the N call. continue except AttributeError: if not free_bayes: is_beagle = True gp = sample['GP'] g_l = [float(o) for o in gp] if max(g_l) < min_probs: # print sample # debug sample_fasta[sample.sample][temp_position] = 'N' continue pl = g_l.index(max(g_l)) else: if genotype == '.' or genotype is None:
sample_fasta[sample.sample][temp_position] = 'N' continue except TypeError: sample_fasta[sample.sample][temp_position] = 'N' continue sample = sample.sample if free_bayes or ploidy == 1: genotype = genotype[0] if genotype == '0': continue elif not is_beagle: genotype = genotype.split('/') else: genotype = genotype.split("|") # If pl is greater than zero ref = record.REF alt = record.ALT # Gl is substituted if free_bayes or int(pl) > 0: if is_ga_or_ct(ref, alt): if not free_bayes: if is_beagle: if g_l[0] > g_l[2]: continue elif pheno_l[0] < pheno_l[2]: continue no_alleles = 1 + len(alt) if not free_bayes: genotype = genotype[0] real_gt = str(alt[int(genotype)-1]) if real_gt == "*": sample_fasta[sample][temp_position] = "N" continue if to_fasta: if species == 'human': if position == 8270 and ref == "CACCCCCTCT": sample_fasta[sample][8280:8289] = '-'*9 continue for i in range(0, max(len(real_gt), len(ref))): if i == (len(real_gt) - 1) and i == (len(ref) - 1): gt = real_gt[i] if free_bayes and len(str(alt)) > 1: real_gt = str(alt[0]) #print(temp_position) sample_fasta[sample][temp_position] = gt elif len(real_gt) > len(ref) and i != 0: if use_indels: # if temp_position == 2677: print real_gt, ref, real_gt[i] # leftover debug gt = list(real_gt[i]) sample_offset_end[sample][temp_position] = len(gt) temp_position = temp_position + 1 sample_fasta[sample] = \ sample_fasta[sample][:temp_position] + \ gt + sample_fasta[sample][temp_position:] sample_offset[sample] += 1 elif len(real_gt) < len(ref) and i != 0: sample_fasta[sample][temp_position + i] = '-' else: if species == 'human': if position == 955 and "ACCCC" in str(alt[0]): sample_lines[sample].extend(["960.1CCCCC"]) try: unique_snps["960.1CCCCC"] += 1 except KeyError: unique_snps["960.1CCCCC"] = 1 continue if position == 8270 and ref == "CACCCCCTCT": sample_lines[sample].extend([str(i)+"d" for i in range(8281, 8290)]) for item in [str(i) +"d" for i in range(8281, 8290)]: try: unique_snps[item] += 1 except KeyError: unique_snps[item] = 1 continue if position == 285 and ref == "CAA": sample_lines[sample].extend([str(i) + "d" for i in range(290, 293)]) for item in [str(i) +"d" for i in range(290, 293)]: try: unique_snps[item] += 1 except KeyError: unique_snps[item] = 1 continue if position == 247 and ref == "GA": sample_lines[sample].extend([str(249) + "d"]) item = str(249) + "d" try: unique_snps[item] += 1 except KeyError: unique_snps[item] = 1 continue for i in range(0, max(len(real_gt), len(ref))): if i == (len(real_gt) - 1) and i == (len(ref) - 1): gt = real_gt[i] if free_bayes and len(str(alt)) > 1: real_gt = str(alt[0]) sample_lines[sample].append(str(position+i) + gt) if unique_only: try: unique_snps[str(position+i) + gt] += 1 except KeyError: unique_snps[str(position+i) + gt] = 1 elif len(real_gt) > len(ref) and i != 0: gt = real_gt[i] sample_lines[sample].append(str(temp_position+i) + "." + str(i) + gt) if unique_only: try: unique_snps[str(temp_position+i) + "." + str(i) + gt] += 1 except KeyError: unique_snps[str(temp_position+i) + "." + str(i) + gt] = 1
temp_position = temp_position - 1 elif len(real_gt) < len(ref) and i != 0: sample_lines[sample].append(str(position+i) + "d") if unique_only: try: unique_snps[str(position+i) + "d"] += 1 except KeyError: unique_snps[str(position+i) + "d"] = 1 if to_fasta: sample_fasta_count_changes = {} for sample in samples: if not impute: for cov in coverage_files: if sample in cov: with open(cov) as coverage_f: start = 0 for line in coverage_f: s_line = line.split('\t') start_temp = int(s_line[1]) - 1 while start_temp != start: sample_fasta[sample][start] = 'N' start += 1 coverage = int(s_line[3]) if coverage <= min_depth: sample_fasta[sample][start] = 'N' start += 1 else: for cov in coverage_files: if sample in cov: with open(cov) as coverage_f: start = 0 for line in coverage_f: s_line = line.split('\t') start_temp = int(s_line[1]) - 1 while start_temp != start: try: sample_fasta_count_changes[start] += 1 except KeyError: sample_fasta_count_changes[start] = 1 #sample_fasta[sample][start] = 'N' start += 1 coverage = int(s_line[3]) if coverage <= min_depth: try: sample_fasta_count_changes[start] += 1 except KeyError: sample_fasta_count_changes[start] = 1 #sample_fasta[sample][start] = 'N' start += 1 # TODO make sure that this cannot get called when using the indels option if impute: for sample in samples: offset = 0 for i in range(0,len(sample_fasta[sample])): if i in sample_offset_end[sample]: offset += sample_offset_end[sample][i] try: temp_number = sample_fasta_count_changes[i] if temp_number == len(coverage_files): sample_fasta[sample][i+offset] = 'N' except KeyError: pass with open(output_fasta, 'w') as out: for sample in samples: out.write('>'+ sample + '\n') out.write("".join(sample_fasta[sample]) + '\n') else: if unique_only: unique_truth = {} for snp, count in unique_snps.items(): if count == len(sample_lines): unique_truth[snp] = False else: unique_truth[snp] = True min_max_coord.append(str(position)) with open(output_fasta, 'w') as hgrep_o: hgrep_o.write('SampleId\tRange\tHaploGroup\tPolymorphisms (delimited by tab)\n') for sample, substitutions in sample_lines.items(): output_line = [] output_line.append(sample) output_line.append('-'.join(min_max_coord)) output_line.append("?") for sub in substitutions: if unique_only: if unique_truth[sub]: output_line.append(sub) else: output_line.append(sub) output_line = "\t".join(output_line) + "\n" if len(output_line.split('\t')) == 3: continue hgrep_o.write(output_line)
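The coverage-masking pass above is the densest part of this function: it walks a per-sample coverage file and N-masks both positions missing from the file and positions at or below min_depth. Isolated, under the same file-format assumption the loop makes (tab-separated, 1-based position in column 2, depth in column 4), it reduces to:

def mask_low_coverage(sequence, coverage_path, min_depth):
    """Return a copy of `sequence` (a list of bases) with uncovered or
    low-coverage positions set to 'N', mirroring the loop above."""
    masked = sequence[:]
    start = 0
    with open(coverage_path) as fh:
        for line in fh:
            cols = line.split('\t')
            pos = int(cols[1]) - 1            # file positions are 1-based
            while start < pos:                # positions the file skipped
                masked[start] = 'N'
                start += 1
            if int(cols[3]) <= min_depth:
                masked[start] = 'N'
            start += 1
    return masked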
def vcf_to_fasta(input_vcf, output_fasta, ref_seq, species, use_indels, min_depth, free_bayes, ploidy, to_fasta, main_sequence, coverage_files, min_probs=0.8, impute=False, unique_only=False): # First part is to get the fasta sequence, then take each position # and alter the reference as necessary for each sample, # because everyone will have different SNPs. f = Fasta(ref_seq) # For now this is only going to work with mtDNA sequences, # but plan to extend this in the future to full genome # gets the full genome sequence and currently assumes # that the fasta only contains one sequence. min_depth = int(min_depth) ploidy = int(ploidy) if impute: is_beagle = True index = [n for n, l in enumerate(f.keys()) if l.startswith(main_sequence)] index = index[0] full_sequence = list(str(f[f.keys()[index]])) min_max_coord = [] first_coordinate = True sample_fasta = {} unique_snps = {} if free_bayes or ploidy == 1: free_bayes = True ploidy = 1 if impute: is_beagle = True free_bayes = False sample_lines = {} vcf_reader = vcf.Reader(open(input_vcf, 'r'), strict_whitespace=True) samples = vcf_reader.samples sample_offset = {} sample_offset_end = {} for sample in samples: sample_lines[sample] = [] sample_fasta[sample] = full_sequence[:] sample_offset[sample] = 0 sample_offset_end[sample] = {} for record in vcf_reader: position = record.POS if first_coordinate: min_max_coord.append(str(position)) first_coordinate = False for sample in record.samples: genotype = sample['GT'] is_beagle = False temp_position = position - 1 + sample_offset[sample.sample] try: pl = sample['PL'] pheno_l = [int(o) for o in pl] dp = sample['DP'] pl = pheno_l.index(min(pheno_l)) if genotype is None or float(dp) <= min_depth: sample_fasta[sample.sample][temp_position] = 'N' # Guard so nothing below overwrites the N call. continue except AttributeError: if not free_bayes: is_beagle = True gp = sample['GP'] g_l = [float(o) for o in gp] if max(g_l) < min_probs: # print sample # debug sample_fasta[sample.sample][temp_position] = 'N' continue pl = g_l.index(max(g_l)) else: if genotype == '.' or genotype is None:
sample_fasta[sample.sample][temp_position] = 'N' continue except TypeError: sample_fasta[sample.sample][temp_position] = 'N' continue sample = sample.sample if free_bayes or ploidy == 1: genotype = genotype[0] if genotype == '0': continue elif not is_beagle: genotype = genotype.split('/') else: genotype = genotype.split("|") # If pl is greater than zero ref = record.REF alt = record.ALT # Gl is substituted if free_bayes or int(pl) > 0: if is_ga_or_ct(ref, alt): if not free_bayes: if is_beagle: if g_l[0] > g_l[2]: continue elif pheno_l[0] < pheno_l[2]: continue no_alleles = 1 + len(alt) if not free_bayes: genotype = genotype[0] real_gt = str(alt[int(genotype) - 1]) if to_fasta: if species == 'human': if position == 8270 and ref == "CACCCCCTCT": sample_fasta[sample][8280:8289] = '-' * 9 continue for i in range(0, max(len(real_gt), len(ref))): if i == (len(real_gt) - 1) and i == (len(ref) - 1): gt = real_gt[i] if free_bayes and len(str(alt)) > 1: real_gt = str(alt[0]) # print(temp_position) # debug sample_fasta[sample][temp_position] = gt elif len(real_gt) > len(ref) and i != 0: if use_indels: gt = list(real_gt[i]) sample_offset_end[sample][temp_position] = len(gt) temp_position = temp_position + 1 sample_fasta[sample] = \ sample_fasta[sample][:temp_position] + \ gt + sample_fasta[sample][temp_position:] sample_offset[sample] += 1 else: gt = real_gt[i] sample_fasta[sample][temp_position] = gt[0] elif len(real_gt) < len(ref) and i != 0: sample_fasta[sample][temp_position + i] = '-' else: if species == 'human': if position == 955 and "ACCCC" in str(alt[0]): sample_lines[sample].extend(["960.1CCCCC"]) try: unique_snps["960.1CCCCC"] += 1 except KeyError: unique_snps["960.1CCCCC"] = 1 continue if position == 8270 and ref == "CACCCCCTCT": sample_lines[sample].extend([str(i) + "d" for i in range(8281, 8290)]) for item in [str(i) + "d" for i in range(8281, 8290)]: try: unique_snps[item] += 1 except KeyError: unique_snps[item] = 1 continue if position == 285 and ref == "CAA": sample_lines[sample].extend([str(i) + "d" for i in range(290, 293)]) for item in [str(i) + "d" for i in range(290, 293)]: try: unique_snps[item] += 1 except KeyError: unique_snps[item] = 1 continue if position == 247 and ref == "GA": sample_lines[sample].extend([str(249) + "d"]) item = str(249) + "d" try: unique_snps[item] += 1 except KeyError: unique_snps[item] = 1 continue for i in range(0, max(len(real_gt), len(ref))): if i == (len(real_gt) - 1) and i == (len(ref) - 1): gt = real_gt[i] if free_bayes and len(str(alt)) > 1: real_gt = str(alt[0]) sample_lines[sample].append(str(position + i) + gt) if unique_only: try: unique_snps[str(position + i) + gt] += 1 except KeyError: unique_snps[str(position + i) + gt] = 1 elif len(real_gt) > len(ref) and i != 0: gt = real_gt[i] sample_lines[sample].append(str(temp_position + i) + "." + str(i) + gt) if unique_only: try: unique_snps[str(temp_position + i) + "." + str(i) + gt] += 1 except KeyError: unique_snps[str(temp_position + i) + "." + str(i) + gt] = 1
temp_position = temp_position - 1 elif len(real_gt) < len(ref) and i != 0: sample_lines[sample].append(str(position + i) + "d") if unique_only: try: unique_snps[str(position + i) + "d"] += 1 except KeyError: unique_snps[str(position + i) + "d"] = 1 if to_fasta: sample_fasta_count_changes = {} for sample in samples: if not impute: for cov in coverage_files: if sample in cov: with open(cov) as coverage_f: start = 0 for line in coverage_f: s_line = line.split('\t') start_temp = int(s_line[1]) - 1 while start_temp != start: sample_fasta[sample][start] = 'N' start += 1 coverage = int(s_line[3]) if coverage <= min_depth: sample_fasta[sample][start] = 'N' start += 1 else: for cov in coverage_files: if sample in cov: with open(cov) as coverage_f: start = 0 for line in coverage_f: s_line = line.split('\t') start_temp = int(s_line[1]) - 1 while start_temp != start: try: sample_fasta_count_changes[start] += 1 except KeyError: sample_fasta_count_changes[start] = 1 #sample_fasta[sample][start] = 'N' start += 1 coverage = int(s_line[3]) if coverage <= min_depth: try: sample_fasta_count_changes[start] += 1 except KeyError: sample_fasta_count_changes[start] = 1 #sample_fasta[sample][start] = 'N' start += 1 # TODO make sure that this cannot get called when using the indels option if impute: for sample in samples: offset = 0 for i in range(0, len(sample_fasta[sample])): if i in sample_offset_end[sample]: offset += sample_offset_end[sample][i] try: temp_number = sample_fasta_count_changes[i] if temp_number == len(coverage_files): sample_fasta[sample][i + offset] = 'N' except KeyError: pass with open(output_fasta, 'w') as out: for sample in samples: out.write('>' + sample + '\n') out.write("".join(sample_fasta[sample]) + '\n') else: if unique_only: unique_truth = {} for snp, count in unique_snps.items(): if count == len(sample_lines): unique_truth[snp] = False else: unique_truth[snp] = True min_max_coord.append(str(position)) with open(output_fasta, 'w') as hgrep_o: hgrep_o.write('SampleId\tRange\tHaploGroup\tPolymorphisms (delimited by tab)\n') for sample, substitutions in sample_lines.items(): output_line = [] output_line.append(sample) output_line.append('-'.join(min_max_coord)) output_line.append("?") for sub in substitutions: if unique_only: if unique_truth[sub]: output_line.append(sub) else: output_line.append(sub) output_line = "\t".join(output_line) + "\n" if len(output_line.split('\t')) == 3: continue hgrep_o.write(output_line)
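Both variants of vcf_to_fasta share the same signature, so a call looks like the following (every file name below is hypothetical; PyVCF's vcf.Reader and pyfasta are the only hard dependencies):

# Hypothetical invocation of either vcf_to_fasta variant above.
vcf_to_fasta(input_vcf="samples.vcf", output_fasta="samples.fa",
             ref_seq="rCRS.fa", species="human", use_indels=False,
             min_depth=10, free_bayes=False, ploidy=2, to_fasta=True,
             main_sequence="chrM", coverage_files=["sample1.cov"],
             min_probs=0.8, impute=False, unique_only=False)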
def main(): """ cuts a fasta file at specific locations """ if len(sys.argv) == 2: prefix = sys.argv[1] else: print "Usage: python split.py <prefix>; assume that <prefix>_BspQI_key.txt <prefix>.fasta and <prefix>_cut_list.csv exist; output will be <prefix>_new.fasta; cut_list is <contigID>,<loc1>,[<loc2>] -- first line is scaling constant" return 0 ren = ReadTable(prefix+'_BspQI_key.txt', 4, '\t') # 4 lines of header #print ren cut = ReadTable(prefix+'_cut_list.csv', 0, ',') # no header (saved as MS-DOS csv via Excel) #print cut # create a dictionary between contig id and FASTA id x[1] and FASTA len x[2] renaming = {} for x in ren: renaming[int(x[0])]=(x[1],int(x[2])) # contig names are converted into integers, as is the length #print renaming # collect the names of the contigs to be cut location = {} scaling = float(cut[0][0]) print 'scaling constant',scaling for x in cut[1:]: index = int(x[0]) # name of the contig to cut; convert the contig name into an integer so we can match it if index in renaming: if (len(x) == 2): l = int(round(float(x[1])/scaling)) # position to cut if (l > renaming[index][1]): # check the length print 'Error: cannot split contig',index,'at position',l,'because it is only',renaming[index][1],'bp long' sys.exit(-1) else: location[renaming[index][0]]=[l] # location[contig_name]->position elif (len(x) == 3): l1 = int(round(float(x[1])/scaling)) # position to cut l2 = int(round(float(x[2])/scaling)) # position to cut if (l1 > renaming[index][1]) or (l2 > renaming[index][1]): # check the length print 'Error: cannot split contig',index,'at position',l1,l2,'because it is only',renaming[index][1],'bp long' sys.exit(-1) else: location[renaming[index][0]]=[l1,l2] # location[contig_name]->positions else: print 'Error: contig',index,'does not exist' sys.exit(-1) print location # open the fasta file for reading fas = Fasta(prefix+'.fasta') # open the new fasta file for writing ofa = open(prefix+'_new.fasta','w') for x in sorted(fas.keys()): # process all the contigs one by one if x in location: # if it needs to be split if len(location[x]) == 1: l = location[x][0] print 'Splitting',x,'at location',l ofa.write('>'+x+'|chimeric1\n') ofa.write(fas[x][:l]+'\n') # prefix ofa.write('>'+x+'|chimeric2\n') ofa.write(fas[x][l:]+'\n') # suffix elif len(location[x]) == 2: l1 = location[x][0] l2 = location[x][1] if (l1 > l2): l1, l2 = l2, l1 print 'Splitting',x,'at location',l1,'and',l2 ofa.write('>'+x+'|chimeric1\n') ofa.write(fas[x][:l1]+'\n') # prefix ofa.write('>'+x+'|chimeric2\n') ofa.write(fas[x][l1:l2]+'\n') # middle ofa.write('>'+x+'|chimeric3\n') ofa.write(fas[x][l2:]+'\n') # suffix else: #print 'Not splitting',x ofa.write('>'+x+'\n') ofa.write(fas[x][:]+'\n') ofa.close()
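ReadTable is called above but not defined in this excerpt. Its contract is clear from the two call sites (skip N header lines, split the remaining lines on a delimiter), so a compatible reconstruction — inferred, not the original — would be:

def ReadTable(path, header_lines, delimiter):
    """Read a delimited text file, skipping `header_lines` leading lines;
    return the remaining non-empty rows as lists of fields."""
    rows = []
    with open(path) as fh:
        for n, line in enumerate(fh):
            if n < header_lines:
                continue
            line = line.strip()
            if line:
                rows.append(line.split(delimiter))
    return rows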
Check whether every pair of sequences in an aligned fasta file shares an overlapping region ''' __version__ = "1.0" from pyfasta import Fasta import argparse # command-line option handling parser = argparse.ArgumentParser() parser.add_argument("-i", "-in", "--input", metavar="filename", dest="input", type=str, help="fasta file to check") parser.add_argument("-v", "--version", action='version', help="The version of this program.", version="Version: " + __version__) args = parser.parse_args() f = Fasta(args.input) loci = sorted(f.keys()) for i1, locus1 in enumerate(loci): for locus2 in loci[i1+1:]: # each unordered pair once; skip self-comparisons flag = 0 sequence1 = f[locus1] sequence2 = f[locus2] i = 0 while i < len(sequence1) and i < len(sequence2): base1 = sequence1[i] base2 = sequence2[i] if base1 != "-" and base2 != "-": flag = 1 break i += 1 if flag == 0: print(locus1, "and", locus2, "have no overlapping sequence!")
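The nested scan above is quadratic in both sequence count and alignment length. For large alignments the same pairwise test can reuse one non-gap mask per sequence, so each sequence is scanned only once; a numpy sketch (Python 3, not part of the original script, input file name hypothetical):

import numpy as np
from pyfasta import Fasta

f = Fasta("aligned.fa")
loci = sorted(f.keys())
# One boolean mask per sequence: True where the column is not a gap.
masks = {k: np.frombuffer(str(f[k]).encode(), dtype='S1') != b'-' for k in loci}
for i1, locus1 in enumerate(loci):
    for locus2 in loci[i1 + 1:]:
        n = min(len(masks[locus1]), len(masks[locus2]))
        if not np.any(masks[locus1][:n] & masks[locus2][:n]):
            print(locus1, "and", locus2, "have no overlapping sequence!")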
# FASTA parser from pyfasta import Fasta # Serializers from .serializers import RefQuerySerializer, RefGenomeSerializer # From JSON to Python data format from django.utils.six import BytesIO from rest_framework.parsers import JSONParser # From serializer to JSON from rest_framework.renderers import JSONRenderer import json f = Fasta('geneticapi/templates/data/genbank.GRCh37.fa') FASTA_INDEX = sorted(f.keys(), reverse=True) class Error(Exception): """Base class for exceptions in this module.""" pass class ChromeParseException(Error): """Exception raised for errors in the input. Attributes: msg -- explanation of the error """ def __init__(self, msg, status): self.ERROR_RESPONSE = json.dumps({ 'message': msg, 'status': status })
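With the module-level Fasta object and FASTA_INDEX in place, a region lookup in this API presumably reduces to dict-style slicing on pyfasta. A minimal illustration (the helper name and the 1-based inclusive convention are assumptions; the DRF view wiring is omitted):

def get_region(chrom, start, stop):
    """Return reference bases for a 1-based inclusive region of `chrom`."""
    if chrom not in FASTA_INDEX:
        raise ChromeParseException('unknown chromosome: %s' % chrom, 400)
    return str(f[chrom][start - 1:stop])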