def intersect_keys(keys, reffile, cache=False):
    """Extract SeqRecords from the index by matching keys."""
    # Build/load the index of reference sequences
    index = None
    if cache:
        refcache = reffile + '.sqlite'
        if os.path.exists(refcache):
            if os.stat(refcache).st_mtime < os.stat(reffile).st_mtime:
                logging.warn("Outdated cache; rebuilding index")
            else:
                try:
                    index = SeqIO.index_db(refcache)
                except Exception:
                    logging.warn("Skipping corrupted cache; rebuilding index")
                    index = None
    else:
        refcache = ':memory:'
    if index is None:
        # Rebuild the index, for whatever reason
        index = SeqIO.index_db(refcache, [reffile], 'fasta')
    # Extract records by key
    for key in keys:
        try:
            record = index[key]
        except LookupError:
            # Missing keys are rare, so it's faster not to check every time
            logging.info("No match: %s", repr(key))
            continue
        yield record
def key_check(self, filename, format, alphabet, comp): """Check indexing with a key function.""" if comp: h = gzip_open(filename, format) id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)] h.close() else: id_list = [rec.id for rec in SeqIO.parse(filename, format, alphabet)] key_list = [add_prefix(id) for id in id_list] with warnings.catch_warnings(): if "_alt_index_" in filename: # BiopythonParserWarning: Could not parse the SFF index: # Unknown magic number b'.diy' in SFF index header: # b'.diy1.00' warnings.simplefilter("ignore", BiopythonParserWarning) rec_dict = SeqIO.index(filename, format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() del rec_dict if not sqlite3: return # In memory, rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) # check error conditions self.assertRaises(ValueError, SeqIO.index_db, ":memory:", format="dummy", key_function=add_prefix) self.assertRaises(ValueError, SeqIO.index_db, ":memory:", filenames=["dummy"], key_function=add_prefix) rec_dict.close() del rec_dict # Saving to file... index_tmp = filename + ".key.idx" if os.path.isfile(index_tmp): os.remove(index_tmp) rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict # Now reload it... rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict # Now reload without passing filenames and format rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet, key_function=add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict os.remove(index_tmp)
def key_check(self, filename, format, alphabet, comp): """Check indexing with a key function.""" if comp: h = gzip_open(filename, format) id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)] h.close() else: id_list = [rec.id for rec in SeqIO.parse(filename, format, alphabet)] key_list = [add_prefix(id) for id in id_list] rec_dict = SeqIO.index(filename, format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() del rec_dict if not sqlite3: return #In memory, rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) #check error conditions self.assertRaises(ValueError, SeqIO.index_db, ":memory:", format="dummy", key_function=add_prefix) self.assertRaises(ValueError, SeqIO.index_db, ":memory:", filenames=["dummy"], key_function=add_prefix) rec_dict.close() del rec_dict #Saving to file... index_tmp = filename + ".key.idx" if os.path.isfile(index_tmp): os.remove(index_tmp) rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict #Now reload it... rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict #Now reload without passing filenames and format rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet, key_function=add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict os.remove(index_tmp)
def simple_check(self, filename, format, alphabet): """Check indexing (without a key function).""" id_list = [rec.id for rec in SeqIO.parse(filename, format, alphabet)] rec_dict = SeqIO.index(filename, format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict._proxy._handle.close() #TODO - Better solution del rec_dict if not sqlite3: return #In memory, #note here give filenames as list of strings rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() del rec_dict #check error conditions self.assertRaises(ValueError, SeqIO.index_db, ":memory:", format="dummy") self.assertRaises(ValueError, SeqIO.index_db, ":memory:", filenames=["dummy"]) #Saving to file... index_tmp = filename + ".idx" if os.path.isfile(index_tmp): os.remove(index_tmp) #To disk, #note here we give the filename as a single string #to confirm that works too (convience feature). rec_dict = SeqIO.index_db(index_tmp, filename, format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() rec_dict._con.close() #hack for PyPy del rec_dict #Now reload it... rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() rec_dict._con.close() #hack for PyPy del rec_dict #Now reload without passing filenames and format rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() rec_dict._con.close() #hack for PyPy del rec_dict os.remove(index_tmp)
def makeSQLindex(infiles=None, data_inpath='', mode='grouped', outname=None):
    ''' Creates an SQL index out of either an uncompressed file or a compressed .bgzf file

    if infiles is a string it is interpreted as a glob
    if infiles is a list, goes through all file names in the list.

    - mode - grouped: all files are indexed to a single index file, specified by outname
    '''
    starting_dir = os.getcwd()
    if data_inpath:
        os.chdir(data_inpath)

    if outname is None:
        outname = 'reads.idx'

    if type(infiles) is str:
        # Fetch files by file types using glob
        import glob
        infiles = glob.glob(infiles)
    elif type(infiles) is not list and type(infiles) is not tuple:
        raise Exception("Invalid input files specified.")

    assert infiles, 'No files found, or no files passed.'

    # Handle multiple types of input for infiles
    if mode == 'grouped':
        idx_filename = outname
        tak = time.time()
        print 'Writing {0} files to SQL index ...'.format(len(infiles))
        SeqIO.index_db(idx_filename, infiles, 'fastq')
        idx_t = time.time() - tak
        print 'Finished Indexing to {0}\n after {1}\n'.format(idx_filename, time.strftime('%H:%M:%S', time.gmtime(idx_t)))
    elif mode == 'separate':
        for filename in infiles:
            tak = time.time()
            print 'Writing SQL index file for {0} ...'.format(filename)
            idx_filename = filename.split('.')[0] + '.idx'
            SeqIO.index_db(idx_filename, filename, 'fastq')
            print '{0} written successfully'.format(idx_filename)
            idx_t = time.time() - tak
            print 'Finished Indexing after {0}\n'.format(time.strftime('%H:%M:%S', time.gmtime(idx_t)))

    if os.getcwd() != starting_dir:
        os.chdir(starting_dir)
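# Hypothetical usage of makeSQLindex() showing the two modes described in its
# docstring; the glob pattern, directory and output name below are made-up
# examples, not values from the original code.
if __name__ == '__main__':
    # 'grouped': one combined SQLite index for all matching files
    makeSQLindex(infiles='*.fastq.bgzf', data_inpath='/data/run1',
                 mode='grouped', outname='run1_reads.idx')
    # 'separate': one .idx file per input file
    makeSQLindex(infiles=['lane1.fastq', 'lane2.fastq'], mode='separate')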
def key_check(self, filename, format, alphabet): """Check indexing with a key function.""" if format in SeqIO._BinaryFormats: mode = "rb" else : mode = "r" id_list = [rec.id for rec in \ SeqIO.parse(open(filename, mode), format, alphabet)] key_list = [add_prefix(id) for id in id_list] rec_dict = SeqIO.index(filename, format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) if not sqlite3: return #In memory, rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) #check error conditions self.assertRaises(ValueError, SeqIO.index_db, ":memory:", format="dummy", key_function=add_prefix) self.assertRaises(ValueError, SeqIO.index_db, ":memory:", filenames=["dummy"], key_function=add_prefix) #Saving to file... index_tmp = filename + ".key.idx" rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() del rec_dict #Now reload it... rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() del rec_dict #Now reload without passing filenames and format rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet, key_function=add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() del rec_dict os.remove(index_tmp)
def load(self, files, dbname=None):
    if isinstance(files, basestring):
        files = [files]
    self.close()
    valid = []
    schemas = set()
    for filename in files:
        if not os.path.isfile(filename):
            print 'No such file: %s' % filename
            continue
        schema = SeqLoader.guess_schema(filename)
        if not schema:
            print 'Unable to guess schema from filename: %s' % filename
            continue
        schemas.add(schema)
        valid.append(filename)
    if len(schemas) != 1:
        raise ValueError('All files should be of the same type, but %d types found: %s' % (len(schemas), schemas))
    if not valid:
        print 'No valid files provided.'
        return False
    if not dbname:
        self.dbname = mktmp_name('_SeqView.db')
        safe_unlink(self.dbname)
        self.tmp_db = True
    else:
        self.dbname = dbname
    self.db = SeqIO.index_db(self.dbname, valid, schemas.pop())
    self._ids = tuple(sorted(self.db.keys()))
    self.master = True
    return bool(self)
def _unpickle_SeqView(dbname, ids, upper):
    v = SeqView(upper)
    v.dbname = dbname
    v.db = SeqIO.index_db(dbname)
    v.master = True
    v._ids = ids
    return v
def reload(self, dbname):
    self.close()
    self.dbname = dbname
    self.db = SeqIO.index_db(self.dbname)
    self._ids = sorted(self.db.keys())
    self.tmp_db = False
    self.master = True
def extract_estseq(self):
    '''Function to extract and print the ESTSeq slice.'''
    self.logger.debug("Extracting the slice from ESTSeq.")
    estseqOut = os.path.join(
        self.dataFolder,
        "{0}.{1}.fa".format(self.new_name, self.estseq_alias))
    if os.path.exists(estseqOut) and os.path.getsize(estseqOut) > 0:
        self.logger.warn("ESTSeq file for alias {0} already present. Exiting".format(
            self.estseq_alias))
        return
    index = SeqIO.index_db(self.estseq)
    if self.chrom not in index:
        self.logger.critical("Chromosome {0} not in ESTSeq. Exiting.".format(self.chrom))
        self.failed = True
        return
    start = max(self.start - self.flank - 1, 0)
    end = min(len(index[self.chrom]) - 1, self.end - 1 + self.flank)
    seq = index[self.chrom][start:end]
    with open(estseqOut, 'w') as out:
        seq.id = self.new_name
        seq.description = ""
        print(seq.format('fasta'), file=out)
    self.logger.debug("Finished extracting the ESTSeq sequence.")
    return
def test_old_files_same_dir(self):
    """Load existing index with correct files (from same directory)."""
    os.chdir("Roche")
    d = SeqIO.index_db("triple_sff.idx",
                       ["E3MFGYR02_no_manifest.sff", "greek.sff", "paired.sff"])
    self.assertEqual(54, len(d))
    self.assertEqual(395, len(d["alpha"]))
def filter_clusters2(cluster_filepath, idx_filepath, size_range, output_dirpath):
    """ Writes a subset of cluster sizes to FastQ files

    The representative sequence is the first sequence record written.
    make the sequence record instead of passing it.
    """
    starting_dir = os.getcwd()
    idx_dir = os.path.split(idx_filepath)[0]

    # Check and create directory
    if not os.path.exists(output_dirpath):
        os.makedirs(output_dirpath)

    cluster_gen = parse(cluster_filepath, idx_filepath)
    seqrec_lookup = SeqIO.index_db(idx_filepath)

    size_counter = Counter()
    for cluster in cluster_gen:
        # Check if cluster size is within defined range
        if cluster.size > size_range[0] and cluster.size < size_range[1]:
            size_counter[cluster.size] += 1

            # Get the sequence records for the cluster
            if os.getcwd() != idx_dir:
                os.chdir(idx_dir)
            # Representative sequence first
            seqrecord = seqrec_lookup[cluster.rep_seq_id]

            if os.getcwd() != output_dirpath:
                os.chdir(output_dirpath)
            # Write cluster to a file
            fname = "clustersize{0}-No{1}.fastq".format(str(cluster.size),
                                                        str(size_counter[cluster.size]))
            if os.path.isfile(fname):
                # Truncate any file left over from a previous run
                output_handle = open(fname, "wb")
                output_handle.close()
            output_handle = open(fname, "a")
            SeqIO.write(seqrecord, output_handle, "fastq")

            for member in cluster.members_id:
                if os.getcwd() != idx_dir:
                    os.chdir(idx_dir)
                # Then the member sequences
                seqrecord = seqrec_lookup[member]
                if os.getcwd() != output_dirpath:
                    os.chdir(output_dirpath)
                # Write sequence record to file
                SeqIO.write(seqrecord, output_handle, "fastq")
            output_handle.close()

    if os.getcwd() != starting_dir:
        os.chdir(starting_dir)
def test_order_index_db(self):
    """Check index_db preserves order in multiple indexed files."""
    files = ["GenBank/NC_000932.faa", "GenBank/NC_005816.faa"]
    ids = []
    for f in files:
        ids.extend(r.id for r in SeqIO.parse(f, "fasta"))
    d = SeqIO.index_db(":memory:", files, "fasta")
    self.assertEqual(ids, list(d))
def simple_check(self, filename, format, alphabet): """Check indexing (without a key function).""" if format in SeqIO._BinaryFormats: mode = "rb" else : mode = "r" id_list = [rec.id for rec in \ SeqIO.parse(open(filename, mode), format, alphabet)] rec_dict = SeqIO.index(filename, format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) if not sqlite3: return #In memory, #note here give filenames as list of strings rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) #check error conditions self.assertRaises(ValueError, SeqIO.index_db, ":memory:", format="dummy") self.assertRaises(ValueError, SeqIO.index_db, ":memory:", filenames=["dummy"]) #Saving to file... index_tmp = filename + ".idx" #To disk, #note here we give the filename as a single string #to confirm that works too (convience feature). rec_dict = SeqIO.index_db(index_tmp, filename, format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() del rec_dict #Now reload it... rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() del rec_dict #Now reload without passing filenames and format rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() del rec_dict os.remove(index_tmp)
def indexer(index_name):
    '''Set up an index to search a large fasta file.

    We are going to try to use the SeqIO.index_db because it creates
    (and subsequently loads) a SQLite database on disk instead of a
    giant 2GB thing in memory.'''
    index = SeqIO.index_db(index_name, 'fasta')
    return index
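# A minimal sketch of the build-then-reload pattern that indexer() relies on,
# assuming a FASTA file named "reads.fa"; both file names here are hypothetical.
# The first index_db() call scans the FASTA and stores record offsets in SQLite;
# the second call only opens the existing database, which is much faster.
def build_and_reload_example():
    index = SeqIO.index_db('reads.fa.idx', 'reads.fa', 'fasta')  # build (or reuse) the index
    index.close()
    index = SeqIO.index_db('reads.fa.idx')  # reload from the SQLite file alone
    return index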
def intersect_keys(keys, reffile, cache=False, clean_accs=False):
    """Extract SeqRecords from the index by matching keys.

    keys - an iterable of sequence identifiers/accessions to select
    reffile - name of a FASTA file to extract the specified sequences from
    cache - save an index of the reference FASTA sequence offsets to disk?
    clean_accs - strip HMMer extensions from sequence accessions?
    """
    # Build/load the index of reference sequences
    index = None
    if cache:
        refcache = reffile + '.sqlite'
        if os.path.exists(refcache):
            if os.stat(refcache).st_mtime < os.stat(reffile).st_mtime:
                logging.warn("Outdated cache; rebuilding index")
            else:
                try:
                    index = (SeqIO.index_db(refcache, key_function=clean_accession)
                             if clean_accs
                             else SeqIO.index_db(refcache))
                except Exception:
                    logging.warn("Skipping corrupted cache; rebuilding index")
                    index = None
    else:
        refcache = ':memory:'
    if index is None:
        # Rebuild the index, for whatever reason
        index = (SeqIO.index_db(refcache, [reffile], 'fasta',
                                key_function=clean_accession)
                 if clean_accs
                 else SeqIO.index_db(refcache, [reffile], 'fasta'))
    # Extract records by key
    if clean_accs:
        keys = (clean_accession(k) for k in keys)
    for key in keys:
        try:
            record = index[key]
        except LookupError:
            # Missing keys are rare, so it's faster not to check every time
            logging.info("No match: %s", repr(key))
            continue
        yield record
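# A minimal usage sketch for intersect_keys(), assuming a plain-text file of
# accession keys (one per line) and a reference FASTA; the names
# "wanted_ids.txt" / key_file, reference_fasta and out_fasta are hypothetical.
def write_matching_records(key_file, reference_fasta, out_fasta):
    """Write the reference records whose IDs appear in key_file to out_fasta."""
    with open(key_file) as handle:
        keys = [line.strip() for line in handle if line.strip()]
    records = intersect_keys(keys, reference_fasta, cache=True)
    count = SeqIO.write(records, out_fasta, 'fasta')
    logging.info("Wrote %d matching records", count)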
def check(self, index_file, sff_files):
    if os.path.isfile(index_file):
        os.remove(index_file)
    # Build index...
    d = SeqIO.index_db(index_file, sff_files, "sff")
    self.assertEqual(395, len(d["alpha"]))
    d._con.close()  # hack for PyPy
    d.close()
    self.assertEqual([os.path.abspath(f) for f in sff_files],
                     [os.path.abspath(f) for f in d._filenames])
    # Load index...
    d = SeqIO.index_db(index_file, sff_files)
    self.assertEqual(395, len(d["alpha"]))
    d._con.close()  # hack for PyPy
    d.close()
    self.assertEqual([os.path.abspath(f) for f in sff_files],
                     [os.path.abspath(f) for f in d._filenames])
    os.remove(index_file)
def _init_storage(self, genome):
    """Load sequences from genome, their sizes and init links"""
    # load fasta into index
    self.sequences = SeqIO.index_db(genome + ".db3", genome, 'fasta')
    self.seq = self.sequences
    # prepare storage
    self.contigs = {c: len(self.seq[c]) for c in self.seq}  # this is very inefficient as it loads each record!
    self.links = {c: [{}, {}] for c in self.contigs}
    self.ilinks = 0
def find_and_open(self, input_directory):
    filename = self.get_filename(input_directory)
    if not os.path.isfile(filename):
        return False
    else:
        self.filename = filename
        self.index_filename = filename.replace('.fasta', '.idx')
        self.index = SeqIO.index_db(self.index_filename, self.filename,
                                    'fasta', generic_protein)
        self._open = True
        return True
def main(args, loglevel): logging.basicConfig(format="%(levelname)s: %(message)s", level=loglevel) if args.seqfile[-3:] == '.gz': fh = gzip.open(args.seqfile, 'rt') else: fh = open(args.seqfile, 'r') for line in fh: if line[0] == '>': filetype = 'fasta' break elif line[0] == '@': filetype = 'fastq' break else: raise RuntimeError("Cannot guess file type for %s" % args.seqfile) fh.close() logging.info("Indexing {} file {}".format(filetype, args.seqfile)) record_dict = SeqIO.index_db(args.seqfile + '.idx', args.seqfile, filetype) logging.info("Reading filter file {}".format(args.filterfile)) with open(args.filterfile, 'r') as fh: id_count = sum(1 for line in fh) fh.seek(0) start_time = time.time() try: for i, line in enumerate(fh): if i % 1000 == 0: try: rate = i / (time.time()-start_time) time_remain = (id_count - i) / rate except ZeroDivisionError: rate = 0 time_remain = 0 logging.info("Processed {} of {} IDs ({:.2f} per second, est. complete at {})". format(i, id_count, rate, time.asctime(time.localtime(time.time() + time_remain)))) try: rec = record_dict[line.rstrip()] print(rec.format(filetype)) except KeyError: logging.debug('record id {} not found'.format(line.strip())) pass except IOError: try: sys.stdout.close() except IOError: pass try: sys.stderr.close() except IOError: pass
def __init__(self, guide_seq, all_seq_keys, length_threshold, evalue_threshold, gb_dir, num_cores): """ Input: name of FASTA file containing guide sequences, dictionary of all GenBank sequences, a list of ingroup/outgroup sequences, the e-value threshold to cluster, and the threshold of sequence length percent similarity to cluster taxa, and the GenBank directory. Generates a list of clusters (each cluster is itself a list of keys to sequences). """ ClusterBuilder.__init__(self, all_seq_keys) lock = multiprocessing.Lock() manager = multiprocessing.Manager() already_compared = manager.list() clusters = manager.list() color = Color() # check for fasta file of guide sequences if not os.path.isfile(guide_seq): print(color.red + "FASTA file of guide sequences not found. Please re-try." + color.done) sys.exit(0) else: # initialize an empty list for each cluster guide_sequences = SeqIO.parse(open(guide_seq, "rU"), "fasta") for guide in guide_sequences: clusters.append([]) # make blast database gb = SeqIO.index_db(gb_dir + "/gb.idx") output_handle = open('blast_db.fasta', 'w') records = [] for key in all_seq_keys: record = gb[key] records.append(record) SeqIO.write(records, output_handle, 'fasta') output_handle.close() # spawn processes print(color.blue + "Spawning " + color.red + str(num_cores) + color.blue + " processes to make clusters." + color.done) processes = [] for i in range(num_cores): p = multiprocessing.Process(target=self.make_guided_clusters_worker, args=(guide_seq, all_seq_keys, \ length_threshold, evalue_threshold, clusters, already_compared, lock, i, gb_dir)) p.start() processes.append(p) for p in processes: p.join() sys.stdout.write("\n") sys.stdout.flush() self.clusters = clusters if os.path.isfile("blast_db.fasta"): os.remove("blast_db.fasta")
def sqlite(path):
    """
    Sets up the SQLite db for the GenBank division.
    Path is the absolute path of the GB files.
    Returns a dictionary of SeqRecord objects.
    """
    color = Color()
    if os.path.exists(path + "/gb.idx"):
        print(color.purple + "Genbank database already downloaded. Indexing sequences..." + color.done)
        return SeqIO.index_db(path + "/gb.idx")
    else:
        files = os.listdir(path)
        path_files = []
        if len(files) == 0:
            print(color.red + "GenBank files not found. Re-download with the -d option. See --help for more details." + color.done)
            sys.exit(0)
        for file in files:
            path_files.append(path + "/" + file)
        print(color.purple + "Genbank database already downloaded. Indexing sequences..." + color.done)
        return SeqIO.index_db(path + "/gb.idx", path_files, "genbank")
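# A minimal sketch of using the dictionary-like index returned by sqlite();
# the gb_path directory and accession argument are placeholders, not values
# from the original code.
def fetch_gb_record(gb_path, accession):
    """Look up a single GenBank record by accession via the SQLite index."""
    gb = sqlite(gb_path)
    record = gb[accession]  # random access: only this record is parsed from disk
    print(record.id, len(record.seq))
    return record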
def fasta2homozygous(out, fasta, identity, overlap, minLength, \ libraries, limit, \ threads=1, joinOverlap=200, endTrimming=0, verbose=0): """Parse alignments and report homozygous contigs""" #create/load fasta index if verbose: sys.stderr.write("Indexing fasta...\n") faidx = SeqIO.index_db(fasta.name+".db3", fasta.name, "fasta") genomeSize = sum(len(faidx[c]) for c in faidx) # depth-of-coverage info c2cov, covTh = None, None if libraries: c2cov, covTh = get_coverage(faidx, fasta.name, libraries, limit, \ verbose) # run blat for identity >= 0.85 similarity, name = blat, "BLAT" # or run last for more diverged haplotypes if identity < 0.85: similarity, name = last_single, "LAST" # multi-threading on python 2.7+ only, as 2.6 stalls if threads > 1 and sys.version_info[0] == 2 \ and sys.version_info[1] > 6: similarity, name = last_multi, "multithreaded LAST" #run blat psl = fasta.name + ".psl.gz" if not os.path.isfile(psl): if verbose: sys.stderr.write("Running %s...\n"%name) similarity(fasta.name, identity, threads, verbose) if verbose: sys.stderr.write("Parsing alignments...\n") #filter alignments hits, overlapping = psl2hits(psl, identity, overlap, joinOverlap, endTrimming) #remove redundant ## maybe store info about removed also contig2skip, identity = hits2skip(hits, faidx, verbose) #print "\n".join("\t".join(map(str, x)) for x in overlapping[:100]); return #report homozygous fasta nsize, k, skipped, ssize, merged = merge_fasta(out, faidx, contig2skip, \ overlapping, minLength, verbose) #summary info = "%s\t%s\t%s\t%s\t%.2f\t%s\t%.2f\t%.3f\t%s\t%s\t%.2f\t%s\t%.2f\n" sys.stderr.write(info%(fasta.name, genomeSize, len(faidx), ssize, 100.0*ssize/genomeSize, \ skipped, 100.0*skipped/len(faidx), identity, len(merged), \ nsize, 100.0*nsize/genomeSize, k, 100.0*k/len(faidx))) return genomeSize, len(faidx), ssize, skipped, identity
def make_contigs_and_split_hints(input_filename, output_filenames, genome_filename): prefix = input_filename.replace('.sorted.hints', '') genome_filename = genome_filename.replace(prefix, '') index_filename = re.sub(FASTA_RE,'.idx', genome_filename) hints_file = open(input_filename) contig_list_filename = output_filenames[0].replace(prefix, '') contig_list_output_file = open(contig_list_filename,'w') genome_dict = SeqIO.index_db(index_filename) current_contig = None contigs_seen = set() for line in hints_file: if line.startswith('#'): continue fields = line.split('\t') assert len(fields) == 9, 'invalid hints format, expected 9 fields, got this line: {}'.format(line) contig_name = fields[0] if contig_name != current_contig: contigs_seen.add(contig_name) if contig_name.startswith('@unitig'): match = unitig_re.match(contig_name) output_prefix = 'unitig_' + match.group(1) else: output_prefix = contig_name hints_output_file = open(output_prefix + HINTS_SUFFIX,'w') contig_output_file = open(output_prefix + CONTIG_SUFFIX, 'w') contig_seq = genome_dict[contig_name] SeqIO.write(contig_seq, contig_output_file, 'fasta') contig_output_file.close() contig_list_output_file.write('\t'.join([contig_name, output_prefix + HINTS_SUFFIX, output_prefix + CONTIG_SUFFIX]) + '\n') current_contig = contig_name hints_output_file.write(line) hints_output_file.close() # write out all the contigs for which we have no hints, # along with blank hints files for contig_name in genome_dict.keys(): if not contig_name in contigs_seen: #TODO: make this into a function, we're re-using code here if contig_name.startswith('@unitig'): match = unitig_re.match(contig_name) output_prefix = 'unitig_' + match.group(1) else: output_prefix = contig_name hints_output_file = open(output_prefix + HINTS_SUFFIX,'w') hints_output_file.close() # write a blank hints file contig_output_file = open(output_prefix + CONTIG_SUFFIX, 'w') contig_seq = genome_dict[contig_name] SeqIO.write(contig_seq, contig_output_file, 'fasta') contig_output_file.close() contig_list_output_file.write('\t'.join([contig_name, output_prefix + HINTS_SUFFIX, output_prefix + CONTIG_SUFFIX]) + '\n') contig_list_output_file.close()
def fastq2random(outbase, files, n, verbose, seqformat='fastq'):
    """Return number of random reads from FastQ file(s)"""
    # generate indexes
    if verbose:
        sys.stderr.write("Generating indexes...\n")
    indexes = []
    for i, f in enumerate(files, 1):
        sys.stderr.write(" %s \r" % i)
        c = Counter()  # counters.append(Counter())
        index = SeqIO.index_db(f.name + '.idx', f.name, seqformat, key_function=c.count)
        index.close()
    # get random entries
    store_random_entries(outbase, files, n, verbose)
def check(self, index_file, sff_files, expt_sff_files):
    if os.path.isfile(index_file):
        os.remove(index_file)
    # Build index...
    d = SeqIO.index_db(index_file, sff_files, "sff")
    self.assertEqual(395, len(d["alpha"]))
    d._con.close()  # hack for PyPy
    d.close()
    self.assertEqual([os.path.abspath(f) for f in sff_files],
                     [os.path.abspath(f) for f in d._filenames])
    # Now directly check the filenames inside the SQLite index:
    filenames, flag = raw_filenames(index_file)
    self.assertEqual(flag, True)
    self.assertEqual(filenames, expt_sff_files)
    # Load index...
    d = SeqIO.index_db(index_file, sff_files)
    self.assertEqual(395, len(d["alpha"]))
    d._con.close()  # hack for PyPy
    d.close()
    self.assertEqual([os.path.abspath(f) for f in sff_files], d._filenames)
    os.remove(index_file)
def get_seq(args, seqname, start=1, end=None, strand=1):
    """builds a biopython database file for the subject sequences and retrieves
    the specified portion of the specified sequence, reverse-complementing if
    necessary and translating dna sequences."""
    sequence_db = SeqIO.index_db(args.index_filename, args.seqfilename, 'fasta')
    seq = sequence_db[seqname][start - 1:end].seq
    if strand < 0:
        seq = seq.reverse_complement()
    if args.translate and args.program == 'tblastn':
        while len(seq) % 3:
            seq = seq[:-1]
        seq = seq.translate()
    return seq
def main(argv): blastfilename = '' seqfilename = '' try: opts, args = getopt.getopt(argv,"hb:s:",["seqfile=","blastfile="]) except getopt.GetoptError: print 'Type Blast2OrthologGroups.py -h for options' sys.exit(2) for opt, arg in opts: if opt == "-h": print 'Blast2OrthologGroups.py -b <blastfile> -s <seqfile>' sys.exit() elif opt in ("-b", "--blastfile"): blastfilename = arg elif opt in ("-s", "--seqfile"): seqfilename = arg #Exclude sequences that cluster with chloroplast-encoded genes chloroplast_clusters = ("Cluster_10175", "Cluster_10226", "Cluster_10232", "Cluster_10485", "Cluster_10984", "Cluster_11190", "Cluster_11826", "Cluster_11977", "Cluster_11980", "Cluster_12324", "Cluster_12326", "Cluster_12658", "Cluster_12697", "Cluster_1272", "Cluster_12984", "Cluster_13661", "Cluster_13666", "Cluster_13836", "Cluster_14312", "Cluster_14375", "Cluster_14565", "Cluster_1826", "Cluster_2867", "Cluster_29556", "Cluster_4023", "Cluster_4421", "Cluster_5258", "Cluster_5739", "Cluster_5972", "Cluster_615", "Cluster_7013", "Cluster_7860", "Cluster_8325", "Cluster_8384") cluster_info = path.join(path.expanduser("~"), "Bioinformatics", "Selaginella", "RefSeq", "SeqClusters.p") seq_groups = pickle.load( open( cluster_info, "rb" ) ) indexfilename = seqfilename + ".inx" new_seqs = SeqIO.index_db(indexfilename, seqfilename, "fasta") used_seqs = {} with open(blastfilename, 'rU') as f: reader=csv.reader(f,delimiter='\t') for row in reader: qseqid, qlen, sacc, slen, pident, length, mismatch, gapopen, qstart, qend, qframe, sstart, send, sframe, evalue, bitscore = row if sacc in seq_groups and not used_seqs.has_key(qseqid) and not seq_groups[sacc] in chloroplast_clusters: used_seqs[qseqid] = 1 seq = new_seqs[qseqid] #cluster_filename = path.join(path.expanduser("~"), "Bioinformatics", "Selaginella", "RefSeq", seq_groups[sacc] + ".fa") cluster_filename = path.join(path.expanduser("~"), "Bioinformatics", "Selaginella", "ContigClusters", seq_groups[sacc] + ".fa") cluster_file = open(cluster_filename, "a") #print "saving %s to %s" % (qseqid, cluster_filename) id = seq.id if int(send) < int(sstart): seq = seq.reverse_complement() id = id + '_rc' seq.id, seq.description = id, id cluster_file.write(seq.format("fasta")) cluster_file.close()
def extract_conseq(self):
    '''Method to extract the CONSeq slice from a CONSeq genome file.'''
    self.logger.debug("Extracting the slice from CONSeq.")
    conseqOut = os.path.join(
        self.dataFolder,
        "{0}.{1}.conseq".format(self.new_name, self.conseq_alias))
    if os.path.exists(conseqOut) and os.path.getsize(conseqOut) > 0:
        self.logger.warn("CONSeq file for alias {0} already present. Exiting".format(self.conseq_alias))
        return
    try:
        index = SeqIO.index_db(self.conseq)
    except Exception as error:
        self.logger.exception(error)
        self.logger.error("Conseq is: {0}".format(self.conseq))
        self.failed = True
        return
    if self.chrom not in index:
        self.logger.critical("Chromosome {0} not in Conseq file. Exiting.".format(self.chrom))
        self.failed = True
        return
    # Establish boundaries
    start = max(self.start - self.flank - 1, 0)
    end = min(len(index[self.chrom]) - 1, self.end - 1 + self.flank)
    seq = index[self.chrom][start:end]
    with open(conseqOut, 'w') as out:
        print(seq.seq, file=out)
    self.logger.debug("Finished extracting the CONSeq sequence.")
    return
def distance_matrix_worker(self, seq_keys, length_threshold, dist_matrix, already_compared, lock, process_num, gb_dir): """ Worker process for make_distance_matrix(). Takes a list "already_compared" of sequences that have already had all pairwise comparisons. Each worker process will work making pairwise comparisons for a different sequence, adding them to the "already_compared" list as they are completed. """ # each process must load its own sqlite gb gb = SeqIO.index_db(gb_dir + "/gb.idx") process_num = str(process_num) i = 0 color = Color() for key in seq_keys: # check whether another process is already comparing this row compare_row = False with lock: if key not in already_compared: already_compared.append(key) compare_row = True if compare_row: # make the blast query record1 = gb[key] output_handle = open('query' + process_num + '.fasta', 'w') SeqIO.write(record1, output_handle, 'fasta') output_handle.close() # make blast database j = 0 output_handle = open('blast_db' + process_num + '.fasta', 'w') records = [] for key2 in seq_keys: # only add sequences that have not yet been compared if j > i: record = gb[key2] records.append(record) if j == i: row = dist_matrix[i] row[j] = 0.0 dist_matrix[i] = row j += 1 SeqIO.write(records, output_handle, 'fasta') output_handle.close() if len(records) > 0: # blast query against blast_db blastn_cmd = NcbiblastnCommandline(query='query' + process_num + '.fasta', subject='blast_db' + process_num + '.fasta', \ out='blast' + process_num + '.xml', outfmt=5) stdout, stderr = blastn_cmd() # parse blast output j = i + 1 blastn_xml = open('blast' + process_num + '.xml', 'r') blast_records = NCBIXML.parse(blastn_xml) for blast_record in blast_records: for alignment in blast_record.alignments: # loop through each high-scoring segment pair (HSP) for hsp in alignment.hsps: length1 = len(record1.seq) length2 = alignment.length # first check if length similarity threshold met if (length1 < length2 * (1 + float(length_threshold))) and (length1 > length2 * (1 - float(length_threshold))): # blast hit found, set distance to e-value row = dist_matrix[i] row[j] = hsp.expect dist_matrix[i] = row row = dist_matrix[j] row[i] = hsp.expect dist_matrix[j] = row else: # set distance to 50.0 if length similarity threshold not met row = dist_matrix[i] row[j] = 50.0 dist_matrix[i] = row row = dist_matrix[j] row[i] = 50.0 dist_matrix[j] = row j += 1 blastn_xml.close() i += 1 # update status percent = str(round(100 * len(already_compared)/float(len(seq_keys)), 2)) sys.stdout.write('\r' + color.blue + 'Completed: ' + color.red + str(len(already_compared)) + '/' + str(len(seq_keys)) + ' (' + percent + '%)' + color.done) sys.stdout.flush() # done looping through all keys, now clean up if os.path.exists('blast_db' + process_num + '.fasta'): os.remove('blast_db' + process_num + '.fasta') if os.path.exists("blast" + process_num + ".xml"): os.remove("blast" + process_num + ".xml") if os.path.exists("query" + process_num + ".fasta"): os.remove("query" + process_num + ".fasta") if os.path.exists("subject" + process_num + ".fasta"): os.remove("subject" + process_num + ".fasta")
def extract_sequences_by_clusters(self, dir_with_cluster_files, dir_with_sequence_files, output_dir, file_with_white_list_cluster_ids=None, mode="families", sequence_file_extension="fasta", sequence_file_format="fasta", label_species=False, separator_for_labeling="@", species_label_first=True): """ basenames of cluster and sequence files must be same mode: clusters - extract sequences from clusters in separate files, species - extract sequences from species to separate files """ white_list_ids = None if file_with_white_list_cluster_ids: white_list_ids = IdSet() white_list_ids.read(file_with_white_list_cluster_ids) clusters_dict = self.read_cluster_files_from_dir( dir_with_cluster_files) cluster_names = self.get_cluster_names(clusters_dict, white_list_ids=white_list_ids) sequence_super_dict = OrderedDict() out_dir = self.check_path(output_dir) for species in clusters_dict: idx_file = "%s_tmp.idx" % species sequence_file = "%s%s.%s" % (self.check_path( dir_with_sequence_files), species, sequence_file_extension) sequence_super_dict[species] = SeqIO.index_db( idx_file, sequence_file, format=sequence_file_format) if mode == "species": seqeuence_names = self.get_sequence_names( clusters_dict, write_ids=False, out_prefix=None, white_list_ids=white_list_ids) for species in seqeuence_names: out_file = "%s%s.%s" % (out_dir, species, sequence_file_extension) SeqIO.write(SequenceRoutines.record_by_id_generator( sequence_super_dict[species], seqeuence_names[species]), out_file, format=sequence_file_format) elif mode == "families": def per_family_record_generator(seq_super_dict, clust_dict, cluster_id): if species_label_first: label_sequence = lambda label, name: "%s%s%s" % ( label, separator_for_labeling, name) else: label_sequence = lambda label, name: "%s%s%s" % ( name, separator_for_labeling, label) for species in seq_super_dict: #print species, cluster_id for record_id in clust_dict[species][cluster_id]: if label_species: record = deepcopy( seq_super_dict[species][record_id]) record.id = label_sequence(species, record_id) yield record else: yield seq_super_dict[species][record_id] for cluster_name in cluster_names: out_file = "%s%s.%s" % (out_dir, cluster_name, sequence_file_extension) SeqIO.write(per_family_record_generator( sequence_super_dict, clusters_dict, cluster_name), out_file, format=sequence_file_format) for species in clusters_dict: os.remove("%s_tmp.idx" % species)
def test_old_same_dir(self):
    """Load existing index with no options (from same directory)."""
    os.chdir("Roche")
    d = SeqIO.index_db("triple_sff.idx")
    self.assertEqual(54, len(d))
    self.assertEqual(395, len(d["alpha"]))
def key_check(self, filename, format, alphabet, comp): """Check indexing with a key function.""" if comp: h = gzip_open(filename, format) id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)] h.close() else: id_list = [ rec.id for rec in SeqIO.parse(filename, format, alphabet) ] key_list = [add_prefix(id) for id in id_list] rec_dict = SeqIO.index(filename, format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() del rec_dict if not sqlite3: return #In memory, rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) #check error conditions self.assertRaises(ValueError, SeqIO.index_db, ":memory:", format="dummy", key_function=add_prefix) self.assertRaises(ValueError, SeqIO.index_db, ":memory:", filenames=["dummy"], key_function=add_prefix) rec_dict.close() del rec_dict #Saving to file... index_tmp = filename + ".key.idx" if os.path.isfile(index_tmp): os.remove(index_tmp) rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict #Now reload it... rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet, add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict #Now reload without passing filenames and format rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet, key_function=add_prefix) self.check_dict_methods(rec_dict, key_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict os.remove(index_tmp)
def simple_check(self, filename, format, alphabet, comp): """Check indexing (without a key function).""" if comp: h = gzip_open(filename, format) id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)] h.close() else: id_list = [ rec.id for rec in SeqIO.parse(filename, format, alphabet) ] with warnings.catch_warnings(): if "_alt_index_" in filename: # BiopythonParserWarning: Could not parse the SFF index: # Unknown magic number b'.diy' in SFF index header: # b'.diy1.00' warnings.simplefilter('ignore', BiopythonParserWarning) rec_dict = SeqIO.index(filename, format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() del rec_dict if not sqlite3: return # In memory, # note here give filenames as list of strings rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() del rec_dict # check error conditions self.assertRaises(ValueError, SeqIO.index_db, ":memory:", format="dummy") self.assertRaises(ValueError, SeqIO.index_db, ":memory:", filenames=["dummy"]) # Saving to file... index_tmp = self.index_tmp if os.path.isfile(index_tmp): os.remove(index_tmp) # To disk, # note here we give the filename as a single string # to confirm that works too (convience feature). rec_dict = SeqIO.index_db(index_tmp, filename, format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict # Now reload it... rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict # Now reload without passing filenames and format # and switch directory to check paths still work index_tmp = os.path.abspath(index_tmp) os.chdir(os.path.dirname(filename)) rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet) self.check_dict_methods(rec_dict, id_list, id_list) rec_dict.close() rec_dict._con.close() # hack for PyPy del rec_dict os.remove(index_tmp)
def get_raw_check(self, filename, format, alphabet, comp): # Also checking the key_function here if comp: h = gzip.open(filename, "rb") raw_file = h.read() h.close() h = gzip_open(filename, format) id_list = [ rec.id.lower() for rec in SeqIO.parse(h, format, alphabet) ] h.close() else: h = open(filename, "rb") raw_file = h.read() h.close() id_list = [ rec.id.lower() for rec in SeqIO.parse(filename, format, alphabet) ] if format in ["sff"]: with warnings.catch_warnings(): warnings.simplefilter('ignore', BiopythonParserWarning) rec_dict = SeqIO.index(filename, format, alphabet, key_function=lambda x: x.lower()) if sqlite3: rec_dict_db = SeqIO.index_db( ":memory:", filename, format, alphabet, key_function=lambda x: x.lower()) else: rec_dict = SeqIO.index(filename, format, alphabet, key_function=lambda x: x.lower()) if sqlite3: rec_dict_db = SeqIO.index_db(":memory:", filename, format, alphabet, key_function=lambda x: x.lower()) self.assertEqual(set(id_list), set(rec_dict)) if sqlite3: self.assertEqual(set(id_list), set(rec_dict_db)) self.assertEqual(len(id_list), len(rec_dict)) for key in id_list: self.assertIn(key, rec_dict) self.assertEqual(key, rec_dict[key].id.lower()) self.assertEqual(key, rec_dict.get(key).id.lower()) raw = rec_dict.get_raw(key) self.assertTrue(isinstance(raw, bytes), "Didn't get bytes from %s get_raw" % format) self.assertTrue(raw.strip()) self.assertIn(raw, raw_file) if sqlite3: raw_db = rec_dict_db.get_raw(key) # Via index using format-specific get_raw which scans the file, # Via index_db in general using raw length found when indexing. self.assertEqual( raw, raw_db, "index and index_db .get_raw() different for %s" % format) rec1 = rec_dict[key] # Following isn't very elegant, but it lets me test the # __getitem__ SFF code is working. if format in SeqIO._BinaryFormats: handle = BytesIO(raw) else: handle = StringIO(_bytes_to_string(raw)) if format == "sff": rec2 = SeqIO.SffIO._sff_read_seq_record( handle, rec_dict._proxy._flows_per_read, rec_dict._proxy._flow_chars, rec_dict._proxy._key_sequence, rec_dict._proxy._alphabet, trim=False) elif format == "sff-trim": rec2 = SeqIO.SffIO._sff_read_seq_record( handle, rec_dict._proxy._flows_per_read, rec_dict._proxy._flow_chars, rec_dict._proxy._key_sequence, rec_dict._proxy._alphabet, trim=True) elif format == "uniprot-xml": self.assertTrue(raw.startswith(b"<entry ")) self.assertTrue(raw.endswith(b"</entry>")) # Currently the __getitem__ method uses this # trick too, but we hope to fix that later raw = """<?xml version='1.0' encoding='UTF-8'?> <uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd"> %s </uniprot> """ % _bytes_to_string(raw) handle = StringIO(raw) rec2 = SeqIO.read(handle, format, alphabet) else: rec2 = SeqIO.read(handle, format, alphabet) self.assertEqual(True, compare_record(rec1, rec2)) rec_dict.close() del rec_dict
def simple_check(self, filename, fmt, alphabet, comp): """Check indexing (without a key function).""" msg = "Test failure parsing file %s with format %s" % (filename, fmt) if comp: mode = "r" + self.get_mode(fmt) with gzip.open(filename, mode) as handle: id_list = [rec.id for rec in SeqIO.parse(handle, fmt, alphabet)] else: id_list = [rec.id for rec in SeqIO.parse(filename, fmt, alphabet)] with warnings.catch_warnings(): if "_alt_index_" in filename: # BiopythonParserWarning: Could not parse the SFF index: # Unknown magic number b'.diy' in SFF index header: # b'.diy1.00' warnings.simplefilter("ignore", BiopythonParserWarning) rec_dict = SeqIO.index(filename, fmt, alphabet) self.check_dict_methods(rec_dict, id_list, id_list, msg=msg) rec_dict.close() if not sqlite3: return # In memory, # note here give filenames as list of strings rec_dict = SeqIO.index_db(":memory:", [filename], fmt, alphabet) self.check_dict_methods(rec_dict, id_list, id_list, msg=msg) rec_dict.close() # check error conditions with self.assertRaises(ValueError, msg=msg): SeqIO.index_db(":memory:", format="dummy") with self.assertRaises(ValueError, msg=msg): SeqIO.index_db(":memory:", filenames=["dummy"]) # Saving to file... index_tmp = self.index_tmp if os.path.isfile(index_tmp): os.remove(index_tmp) # To disk, # note here we give the filename as a single string # to confirm that works too. rec_dict = SeqIO.index_db(index_tmp, filename, fmt, alphabet) self.check_dict_methods(rec_dict, id_list, id_list, msg=msg) rec_dict.close() rec_dict._con.close() # hack for PyPy # Now reload it... rec_dict = SeqIO.index_db(index_tmp, [filename], fmt, alphabet) self.check_dict_methods(rec_dict, id_list, id_list, msg=msg) rec_dict.close() rec_dict._con.close() # hack for PyPy # Now reload without passing filenames and format # and switch directory to check paths still work index_tmp = os.path.abspath(index_tmp) os.chdir(os.path.dirname(filename)) try: rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet) finally: os.chdir(CUR_DIR) self.check_dict_methods(rec_dict, id_list, id_list, msg=msg) rec_dict.close() rec_dict._con.close() # hack for PyPy os.remove(index_tmp)
def key_check(self, filename, fmt, alphabet, comp): """Check indexing with a key function.""" msg = "Test failure parsing file %s with format %s" % (filename, fmt) if comp: mode = "r" + self.get_mode(fmt) with gzip.open(filename, mode) as handle: id_list = [rec.id for rec in SeqIO.parse(handle, fmt, alphabet)] else: id_list = [rec.id for rec in SeqIO.parse(filename, fmt, alphabet)] key_list = [self.add_prefix(id) for id in id_list] with warnings.catch_warnings(): if "_alt_index_" in filename: # BiopythonParserWarning: Could not parse the SFF index: # Unknown magic number b'.diy' in SFF index header: # b'.diy1.00' warnings.simplefilter("ignore", BiopythonParserWarning) rec_dict = SeqIO.index(filename, fmt, alphabet, self.add_prefix) self.check_dict_methods(rec_dict, key_list, id_list, msg=msg) rec_dict.close() if not sqlite3: return # In memory, rec_dict = SeqIO.index_db( ":memory:", [filename], fmt, alphabet, self.add_prefix ) self.check_dict_methods(rec_dict, key_list, id_list, msg=msg) # check error conditions with self.assertRaises(ValueError, msg=msg): SeqIO.index_db(":memory:", format="dummy", key_function=self.add_prefix) with self.assertRaises(ValueError, msg=msg): SeqIO.index_db( ":memory:", filenames=["dummy"], key_function=self.add_prefix ) rec_dict.close() # Saving to file... index_tmp = filename + ".key.idx" if os.path.isfile(index_tmp): os.remove(index_tmp) rec_dict = SeqIO.index_db( index_tmp, [filename], fmt, alphabet, self.add_prefix ) self.check_dict_methods(rec_dict, key_list, id_list, msg=msg) rec_dict.close() rec_dict._con.close() # hack for PyPy # Now reload it... rec_dict = SeqIO.index_db( index_tmp, [filename], fmt, alphabet, self.add_prefix ) self.check_dict_methods(rec_dict, key_list, id_list, msg=msg) rec_dict.close() rec_dict._con.close() # hack for PyPy # Now reload without passing filenames and format rec_dict = SeqIO.index_db( index_tmp, alphabet=alphabet, key_function=self.add_prefix ) self.check_dict_methods(rec_dict, key_list, id_list, msg=msg) rec_dict.close() rec_dict._con.close() # hack for PyPy os.remove(index_tmp)
import logging
import os
import platform
import sqlite3
import sys
import tempfile

import Bio
from Bio import SeqIO

# set up log
logging.basicConfig(level=logging.DEBUG)

# debug biopython issue
logging.debug('sys.version')
logging.debug(sys.version)
logging.debug('sqlite3.version')
logging.debug(sqlite3.version)
logging.debug('platform.python_implementation()')
logging.debug(platform.python_implementation())
logging.debug('platform.platform()')
logging.debug(platform.platform())
logging.debug('Bio.__version__')
logging.debug(Bio.__version__)
logging.debug('os.environ')
logging.debug(os.environ)

read_file = '/r1.fq'
outdir = tempfile.mkdtemp(dir=os.environ.get('PWD'))
db_file = os.path.join(outdir, 'r1.idx')

try:
    read_index = SeqIO.index_db(db_file, read_file, 'fastq')
except Exception as e:
    logging.exception('')
    raise e
def gfftosequence(gff, sequencedir): seqdirectory = os.path.abspath(sequencedir) seqfiles = [ os.path.join(seqdirectory, file) for file in os.listdir(seqdirectory) ] #Make index of fasta files print 'Indexing sequences...' seqdb = SeqIO.index_db('seqdb.idx', seqfiles, 'fasta') print '{0} sequences indexed'.format(len(seqdb)) #Make gff database gff_fn = gff db_fn = os.path.basename(gff_fn) + '.db' if os.path.isfile(db_fn) == False: gffutils.create_db(gff_fn, db_fn) db = gffutils.FeatureDB(db_fn) seqs = {} #dictionary where key is ID and value is sequence counter = 0 UTRs = db.features_of_type('5UTR') for UTR in UTRs: counter += 1 chrm = UTR.chrom strand = UTR.strand ID = UTR.attributes['ID'] exoncoords = [] UTRsequence = '' for exon in db.children(UTR, featuretype='exon'): exoncoords.append([exon.start, exon.stop]) number_of_exons = len(exoncoords) if strand == '+': for idx, exonstartstop in enumerate(exoncoords): if idx + 1 < number_of_exons: UTRsequence += seqdb[chrm].seq[exonstartstop[0] - 1:exonstartstop[1]].upper() elif idx + 1 == number_of_exons: #if at the last exon #To check for start codon, take three more nucleotides (change [1]-1 to [1]+2) UTRsequence += seqdb[chrm].seq[exonstartstop[0] - 1:exonstartstop[1] - 1].upper() elif strand == '-': for idx, exonstartstop in enumerate(reversed( exoncoords)): #reverse exon order since this is - strand if idx + 1 < number_of_exons: UTRsequence += seqdb[chrm].seq[exonstartstop[0] - 1:exonstartstop[1]].upper( ).reverse_complement() elif idx + 1 == number_of_exons: #if at the last exon #To check for start codon, take three more nucleotides (change [0] to [0]-3) UTRsequence += seqdb[chrm].seq[ exonstartstop[0]:exonstartstop[1]].upper( ).reverse_complement() seqs[ID] = UTRsequence if counter <= 50 and counter % 10 == 0: print 'Retrieving sequence %i' % (counter) elif counter > 50 and counter % 50 == 0: print 'Retrieving sequence %i' % (counter) print 'Retrieved {0} sequences.'.format(len(seqs)) os.remove(db_fn) os.remove('seqdb.idx') return seqs
def go(input, output, raw_reads, linear_refs, circular_refs, coverage_file): if raw_reads: assert os.path.isfile(raw_reads) idx = raw_reads + ".idx" if os.path.isfile(idx): sys.stderr.write("Loading %s\n" % idx) raw = SeqIO.index_db(idx) else: sys.stderr.write("Creating %s\n" % idx) raw = SeqIO.index_db(idx, raw_reads, "fastq") sys.stderr.write( "Have %i raw reads (used for unmapped partners)\n" % len(raw)) else: raw = dict() ref_len_linear = dict() if linear_refs: for f in linear_refs: ref_len_linear.update(get_fasta_ids_and_lengths(f)) ref_len_circles = dict() if circular_refs: for f in circular_refs: ref_len_circles.update(get_fasta_ids_and_lengths(f)) # print ref_len_circles if input is None: input_handle = sys.stdin elif isinstance(input, basestring): input_handle = open(input) else: input_handle = input if output is None: output_handle = sys.stdout elif isinstance(output, basestring): output_handle = open(output, "w") else: output_handle = output line = input_handle.readline() while line[0] == "@": # SAM header if line[0:4] == "@SQ\t": parts = line[4:].strip().split("\t") rname = None length = None for p in parts: if p.startswith("SN:"): rname = p[3:] if p.startswith("LN:"): length = int(p[3:]) if rname in ref_len_linear: assert length == ref_len_linear[rname] # print "Found @SQ line for linear reference %s" % rname elif rname in ref_len_circles: if length == 2 * ref_len_circles[rname]: # Return the length to its correct value (should have # happened already) sys.stderr.write("Fixing @SQ line for %s, length %i --> %i\n" % (rname, length, ref_len_circles[rname])) line = "@SQ\tSN:%s\tLN:%i\n" % ( rname, ref_len_circles[rname]) else: assert length == ref_len_circles[rname] elif rname is None: sys_exit("Bad @SQ line:\n%s" % line) else: sys_exit("This reference was not given!:\n%s" % line) output_handle.write(line) line = input_handle.readline() global solo0, solo1, solo2, solo12 solo0 = solo1 = solo2 = solo12 = 0 global coverage coverage = dict() if coverage_file: import numpy for lengths in [ref_len_linear, ref_len_circles]: for ref, length in lengths.iteritems(): coverage[ref] = numpy.zeros((5, length), numpy.float) cur_read_name = None reads = set() while line: # SAM read qname, flag, rname, pos, rest = line.split("\t", 4) flag = int(flag) if " " in qname: # Stupid mrfast! qname = qname.split(None, 1)[0] if rname in ref_len_circles and pos != "0": length = ref_len_circles[rname] if length <= int(pos) - 1: sys_exit("Have POS %s yet length of %s is %i (circular)\n" % (pos, rname, length)) elif rname in ref_len_linear and pos != "0": length = ref_len_linear[rname] if length <= int(pos) - 1: sys_exit("Have POS %s yet length of %s is %i (linear)\n" % (pos, rname, length)) if qname[-2:] == "/1": qname = qname[:-2] frag = 1 elif qname[-2:] == "/2": qname = qname[:-2] frag = 2 elif not (flag & 0x1): frag = 0 # Single read elif flag & 0x40: frag = 1 elif flag & 0x80: frag = 2 else: frag = 0 # Assume unpaired if qname == cur_read_name: # Cache this, as a tuple - ordered to allow sorting on position: # Using a set will eliminate duplicates after adjusting POS reads.add((qname, frag, rname, pos, flag, rest)) else: if coverage_file: count_coverage(coverage, reads) flush_cache(output_handle, reads, raw, ref_len_linear, ref_len_circles) reads = set([(qname, frag, rname, pos, flag, rest)]) cur_read_name = qname # Next line... 
line = input_handle.readline() if reads: if coverage_file: count_coverage(coverage, reads) flush_cache(output_handle, reads, raw, ref_len_linear, ref_len_circles) if isinstance(input, basestring): input_handle.close() if isinstance(output, basestring): output_handle.close() if coverage_file: handle = open(coverage_file, "w") for lengths in [ref_len_linear, ref_len_circles]: for ref, length in lengths.iteritems(): handle.write(">%s length %i\n" % (ref, length)) for row in coverage[ref]: assert len(row) == length handle.write("\t".join("%.1f" % v for v in row) + "\n") handle.close() sys.stderr.write("%i singletons; %i where only /1, %i where only /2, %i where both present\n" % (solo0, solo1, solo2, solo12))
#!/usr/bin/env python
import os

from Bio import SeqIO
from RouToolPa.Routines import FileRoutines

workdir = "/home/mahajrod/Genetics/Projects/nxf/nxf_arthropoda/"
data_dir = "/home/mahajrod/Genetics/Projects/nxf/nxf_arthropoda/data/"

os.chdir(workdir)
data_files = FileRoutines.make_list_of_path_to_files([data_dir])

record_dict = SeqIO.index_db("tmp.idx", data_files, format="genbank")

print("#organism\ttaxonomy\tregion_id\ttranscript_id\tproduct\texon_len")
for record_id in record_dict:
    for feature in record_dict[record_id].features:
        if feature.type == "mRNA":
            mRNA_string = ""
            mRNA_string += "%s" % record_dict[record_id].annotations["organism"]
            mRNA_string += "\t%s" % (";".join(record_dict[record_id].annotations["taxonomy"]))
            mRNA_string += "\t%s" % record_id
            mRNA_string += "\t%s" % (feature.qualifiers["transcript_id"][0]
                                     if "transcript_id" in feature.qualifiers else ".")
            mRNA_string += "\t%s" % (feature.qualifiers["product"][0]
                                     if "product" in feature.qualifiers else ".")
            location_lenths = []
#!/usr/bin/env python
import sys

import numpy as np
from Bio import SeqIO

paf_file = sys.argv[1]
unitigs = sys.argv[2]
output_stats = sys.argv[3]
output_file = sys.argv[4]

unitigs_corrected = open(output_file, 'w')
stats_file = open(output_stats, 'a')

# open up sequence map for illumina
illumina_dict = SeqIO.index_db(unitigs + ".idx", unitigs, "fasta")


def find_sequence_illumina(seqid, start, end):
    return illumina_dict[seqid].seq[start:end + 1]


def write_sequence(f, seq):
    for i in range(0, len(seq), 60):
        f.write(str(seq[i:i + 60]) + "\n")


############ read input paf file ############
mapping_freq = {}
nano_lengths = {}
def test_old_files(self):
    """Load existing index with correct files."""
    d = SeqIO.index_db(
        "Roche/triple_sff.idx",
        ["E3MFGYR02_no_manifest.sff", "greek.sff", "paired.sff"])
    self.assertEqual(54, len(d))
def get_raw_check(self, filename, fmt, alphabet, comp): # Also checking the key_function here msg = "Test failure parsing file %s with format %s" % (filename, fmt) if comp: with gzip.open(filename, "rb") as handle: raw_file = handle.read() mode = "r" + self.get_mode(fmt) with gzip.open(filename, mode) as handle: id_list = [rec.id.lower() for rec in SeqIO.parse(handle, fmt, alphabet)] else: with open(filename, "rb") as handle: raw_file = handle.read() id_list = [rec.id.lower() for rec in SeqIO.parse(filename, fmt, alphabet)] if fmt in ["sff"]: with warnings.catch_warnings(): warnings.simplefilter("ignore", BiopythonParserWarning) rec_dict = SeqIO.index(filename, fmt, alphabet, key_function=str.lower) if sqlite3: rec_dict_db = SeqIO.index_db( ":memory:", filename, fmt, alphabet, key_function=str.lower, ) else: rec_dict = SeqIO.index(filename, fmt, alphabet, key_function=str.lower) if sqlite3: rec_dict_db = SeqIO.index_db( ":memory:", filename, fmt, alphabet, key_function=str.lower, ) self.assertEqual(set(id_list), set(rec_dict), msg=msg) if sqlite3: self.assertEqual(set(id_list), set(rec_dict_db), msg=msg) self.assertEqual(len(id_list), len(rec_dict), msg=msg) for key in id_list: self.assertIn(key, rec_dict, msg=msg) self.assertEqual(key, rec_dict[key].id.lower(), msg=msg) self.assertEqual(key, rec_dict.get(key).id.lower(), msg=msg) raw = rec_dict.get_raw(key) self.assertIsInstance(raw, bytes, msg=msg) self.assertTrue(raw.strip(), msg=msg) self.assertIn(raw, raw_file, msg=msg) if sqlite3: raw_db = rec_dict_db.get_raw(key) # Via index using format-specific get_raw which scans the file, # Via index_db in general using raw length found when indexing. self.assertEqual(raw, raw_db, msg=msg) rec1 = rec_dict[key] # Following isn't very elegant, but it lets me test the # __getitem__ SFF code is working. mode = self.get_mode(fmt) if mode == "b": handle = BytesIO(raw) elif mode == "t": handle = StringIO(raw.decode()) else: raise RuntimeError("Unexpected mode %s" % mode) if fmt == "sff": rec2 = SeqIO.SffIO._sff_read_seq_record( handle, rec_dict._proxy._flows_per_read, rec_dict._proxy._flow_chars, rec_dict._proxy._key_sequence, rec_dict._proxy._alphabet, trim=False, ) elif fmt == "sff-trim": rec2 = SeqIO.SffIO._sff_read_seq_record( handle, rec_dict._proxy._flows_per_read, rec_dict._proxy._flow_chars, rec_dict._proxy._key_sequence, rec_dict._proxy._alphabet, trim=True, ) elif fmt == "uniprot-xml": self.assertTrue(raw.startswith(b"<entry "), msg=msg) self.assertTrue(raw.endswith(b"</entry>"), msg=msg) # Currently the __getitem__ method uses this # trick too, but we hope to fix that later raw = ( """<?xml version='1.0' encoding='UTF-8'?> <uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd"> %s </uniprot> """ % raw.decode() ) handle = StringIO(raw) rec2 = SeqIO.read(handle, fmt, alphabet) else: rec2 = SeqIO.read(handle, fmt, alphabet) self.assertEqual(True, compare_record(rec1, rec2)) rec_dict.close() del rec_dict
def test_old_rel(self):
    """Load existing index (with relative paths) with no options (from parent directory)."""
    d = SeqIO.index_db("Roche/triple_sff_rel_paths.idx")
    self.assertEqual(54, len(d))
    self.assertEqual(395, len(d["alpha"]))
def test_order_index_db(self):
    """Check index_db preserves the ordering of the indexed file."""
    d = SeqIO.index_db(":memory:", [self.f], "fasta")
    self.assertEqual(self.ids, list(d))
def key_check(self, filename, format, alphabet, comp):
    """Check indexing with a key function."""
    if comp:
        h = gzip_open(filename, format)
        id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)]
        h.close()
    else:
        id_list = [
            rec.id for rec in SeqIO.parse(filename, format, alphabet)
        ]
    key_list = [add_prefix(id) for id in id_list]

    with warnings.catch_warnings():
        if "_alt_index_" in filename:
            # BiopythonParserWarning: Could not parse the SFF index:
            # Unknown magic number b'.diy' in SFF index header:
            # b'.diy1.00'
            warnings.simplefilter('ignore', BiopythonParserWarning)
        rec_dict = SeqIO.index(filename, format, alphabet, add_prefix)

    self.check_dict_methods(rec_dict, key_list, id_list)
    rec_dict.close()
    del rec_dict

    if not sqlite3:
        return

    # In memory,
    rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet,
                              add_prefix)
    self.check_dict_methods(rec_dict, key_list, id_list)
    # check error conditions
    self.assertRaises(ValueError, SeqIO.index_db,
                      ":memory:", format="dummy",
                      key_function=add_prefix)
    self.assertRaises(ValueError, SeqIO.index_db,
                      ":memory:", filenames=["dummy"],
                      key_function=add_prefix)
    rec_dict.close()
    del rec_dict

    # Saving to file...
    index_tmp = filename + ".key.idx"
    if os.path.isfile(index_tmp):
        os.remove(index_tmp)
    rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet,
                              add_prefix)
    self.check_dict_methods(rec_dict, key_list, id_list)
    rec_dict.close()
    rec_dict._con.close()  # hack for PyPy
    del rec_dict

    # Now reload it...
    rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet,
                              add_prefix)
    self.check_dict_methods(rec_dict, key_list, id_list)
    rec_dict.close()
    rec_dict._con.close()  # hack for PyPy
    del rec_dict

    # Now reload without passing filenames and format
    rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet,
                              key_function=add_prefix)
    self.check_dict_methods(rec_dict, key_list, id_list)
    rec_dict.close()
    rec_dict._con.close()  # hack for PyPy
    del rec_dict
    os.remove(index_tmp)
def make_index(input_file, output_file):
    SeqIO.index_db(output_file, input_file, 'fasta')
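The helper above builds the on-disk SQLite index as a side effect and discards the returned dictionary. A minimal usage sketch (the file names and record ID are illustrative, not from the original source): once the index file exists it can be reopened without re-listing the FASTA files or the format, because index_db stores both in the SQLite file, as the reload tests elsewhere in this collection demonstrate.

from Bio import SeqIO

make_index("contigs.fasta", "contigs.idx")   # build the index once
contigs = SeqIO.index_db("contigs.idx")      # reopen later; filenames and format come from the index
record = contigs["contig_1"]                 # hypothetical record ID, looked up lazily from disk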
def index_gb():
    print("indexing genbank file")
    gb_index = SeqIO.index_db(index_file, gb_file, "genbank")
    print("done indexing file\n")
    return gb_index
### Set up the logger
# create logger
logger = logging.getLogger('BuildBioIndex')
logger.setLevel(logging.DEBUG)

# create console handler which logs even debug messages
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)  # WARN

# create formatter and add it to the handler
formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)

# add the handler to the logger
logger.addHandler(ch)

logger.debug(" ".join(sys.argv))

if len(sys.argv) != 3:
    logger.error("2 arguments are required (index file, fasta file)")
    sys.exit(1)

index_file = sys.argv[1]
fasta_file = sys.argv[2]

logger.debug("Index file: %s", index_file)
logger.debug("Fasta file: %s", fasta_file)

if os.path.isfile(fasta_file):
    IndexDB = SeqIO.index_db(index_file, fasta_file, "fasta")
else:
    logger.error("Fasta file (%s) is not a file", fasta_file)
    sys.exit(1)
def simple_check(self, filename, format, alphabet, comp):
    """Check indexing (without a key function)."""
    if comp:
        h = gzip_open(filename, format)
        id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)]
        h.close()
    else:
        id_list = [
            rec.id for rec in SeqIO.parse(filename, format, alphabet)
        ]

    rec_dict = SeqIO.index(filename, format, alphabet)
    self.check_dict_methods(rec_dict, id_list, id_list)
    rec_dict.close()
    del rec_dict

    if not sqlite3:
        return

    # In memory,
    # note here give filenames as list of strings
    rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet)
    self.check_dict_methods(rec_dict, id_list, id_list)
    rec_dict.close()
    del rec_dict

    # check error conditions
    self.assertRaises(ValueError, SeqIO.index_db,
                      ":memory:", format="dummy")
    self.assertRaises(ValueError, SeqIO.index_db,
                      ":memory:", filenames=["dummy"])

    # Saving to file...
    index_tmp = self.index_tmp
    if os.path.isfile(index_tmp):
        os.remove(index_tmp)

    # To disk,
    # note here we give the filename as a single string
    # to confirm that works too (convenience feature).
    rec_dict = SeqIO.index_db(index_tmp, filename, format, alphabet)
    self.check_dict_methods(rec_dict, id_list, id_list)
    rec_dict.close()
    rec_dict._con.close()  # hack for PyPy
    del rec_dict

    # Now reload it...
    rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet)
    self.check_dict_methods(rec_dict, id_list, id_list)
    rec_dict.close()
    rec_dict._con.close()  # hack for PyPy
    del rec_dict

    # Now reload without passing filenames and format
    rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet)
    self.check_dict_methods(rec_dict, id_list, id_list)
    rec_dict.close()
    rec_dict._con.close()  # hack for PyPy
    del rec_dict
    os.remove(index_tmp)
                    type=FileRoutines.make_list_of_path_to_files_from_string,
                    help="Comma-separated list of input files/directories with sequences")
parser.add_argument("-o", "--output_directory", action="store", dest="output",
                    type=FileRoutines.check_path,
                    help="Directory to output groups of sequences")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of input and output files. Allowed formats: genbank, fasta (default)")
parser.add_argument("-e", "--extension", action="store", dest="extension",
                    help="Extension of output files. Default: equal to -f")
parser.add_argument("-d", "--id_file", action="store", dest="id_file",
                    help="File with groups of sequences to extract (.fam file).")

args = parser.parse_args()

FileRoutines.safe_mkdir(args.output)
args.extension = args.extension if args.extension else args.format

tmp_index_file = "temp.idx"
# id_list = read_ids(args.id_file)
id_list = IdSet(filename=args.id_file)

sequence_groups_id = SynDict()
sequence_groups_id.read(args.id_file, split_values=True)

# print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format)

for group in sequence_groups_id:
    SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_dict,
                                                        sequence_groups_id[group],
                                                        verbose=True),
                "%s%s.%s" % (args.output, group, args.extension),
                format=args.format)

os.remove(tmp_index_file)
def test_old(self):
    """Load existing index with no options (from parent directory)."""
    d = SeqIO.index_db("Roche/triple_sff.idx")
    self.assertEqual(54, len(d))
    self.assertRaises(FileNotFoundError, d.get_raw, "alpha")
"--dont_extract_kmer_list", action="store_true", dest="dont_extract_kmer_list", help="Don't extract kmer list") args = parser.parse_args() args.input = make_list_of_path_to_files(args.input) if args.count_both_strands and args.add_rev_com: raise ValueError( "Options -b/--count_both_strands and -r/--add_reverse_complement are not compatible" ) if args.add_rev_com: file_with_rev_com = args.base_prefix + "_with_rev_com.fasta" record_dict = SeqIO.index_db("temp_index.idx", args.input, format="fasta") SeqIO.write(rev_com_generator(record_dict, yield_original_record=True), file_with_rev_com, "fasta") args.base_prefix += "_with_rev_com" base_file = "%s_%i_mer.jf" % (args.base_prefix, args.kmer_length) kmer_table_file = "%s_%i_mer.counts" % (args.base_prefix, args.kmer_length) kmer_file = "%s_%i_mer.kmer" % (args.base_prefix, args.kmer_length) Jellyfish.threads = args.threads Jellyfish.path = args.jellyfish_path if args.jellyfish_path else "" Jellyfish.count(args.input if not args.add_rev_com else file_with_rev_com, base_file, kmer_length=args.kmer_length, hash_size=args.hash_size, count_both_strands=args.count_both_strands)
def test_old_format(self):
    """Load existing index with correct format."""
    d = SeqIO.index_db("Roche/triple_sff.idx", format="sff")
    self.assertEqual(54, len(d))
def test_old(self):
    """Load existing index with no options."""
    d = SeqIO.index_db("Roche/triple_sff.idx")
    self.assertEqual(54, len(d))
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta", help="format of file with sequences - default: fasta.") parser.add_argument("-i", "--input", action="store", dest="input", help="file with sequences") parser.add_argument("-o", "--output", action="store", dest="output", default="out.t", help="output file - default: out.t.") args = parser.parse_args() out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") record_dict = SeqIO.index_db("temp_index.idx", [args.input], format=args.format) lengths_dict = SequenceRoutines.get_lengths(record_dict, out_file=out_fd) print("Longest sequence: %i" % max(lengths_dict.values())) print("Shortest sequence: %i" % min(lengths_dict.values())) print("Total length: %i" % sum(lengths_dict.values())) os.remove("temp_index.idx")
def distance_matrix_worker_old(self, seq_keys, length_threshold, dist_matrix, already_compared, lock, process_num, gb_dir):
    """
    Worker process for make_distance_matrix(). Takes a list "already_compared" of
    sequences that have already had all pairwise comparisons. Each worker process
    will work making pairwise comparisons for a different sequence, adding them to
    the "already_compared" list as they are completed.
    """
    # each process must load its own sqlite gb
    gb = SeqIO.index_db(gb_dir + "/gb.idx")
    process_num = str(process_num)
    i = 0
    color = Color()
    for key in seq_keys:
        # check whether another process is already comparing this row
        compare_row = False
        with lock:
            if key not in already_compared:
                already_compared.append(key)
                compare_row = True
        if compare_row:
            # get the sequence record to compare
            record1 = gb[key]
            output_handle = open('query' + process_num + '.fasta', 'w')
            SeqIO.write(record1, output_handle, 'fasta')
            output_handle.close()
            j = 0
            for key2 in seq_keys:
                # only calculate e-values for pairs that have not yet been compared
                if dist_matrix[i][j] == 99:
                    if key == key2:
                        row = dist_matrix[i]
                        row[j] = 0.0
                        dist_matrix[i] = row
                    # check sequence lengths
                    else:
                        # print("proc # = " + process_num + " i = " + str(i) + " j = " + str(j))
                        record2 = gb[key2]
                        length1 = len(record1.seq)
                        length2 = len(record2.seq)
                        # set distance to 50.0 if length similarity threshold not met
                        if (length2 > length1 * (1 + float(length_threshold))) or (length2 < length1 * (1 - float(length_threshold))):
                            row = dist_matrix[i]
                            row[j] = 50.0
                            dist_matrix[i] = row
                            row = dist_matrix[j]
                            row[i] = 50.0
                            dist_matrix[j] = row
                        else:
                            # do the blast comparison
                            output_handle = open('subject' + process_num + '.fasta', 'w')
                            SeqIO.write(record2, output_handle, 'fasta')
                            output_handle.close()
                            blastn_cmd = NcbiblastnCommandline(query='query' + process_num + '.fasta',
                                                               subject='subject' + process_num + '.fasta',
                                                               out='blast' + process_num + '.xml',
                                                               outfmt=5)
                            stdout, stderr = blastn_cmd()
                            blastn_xml = open('blast' + process_num + '.xml', 'r')
                            blast_records = NCBIXML.parse(blastn_xml)
                            for blast_record in blast_records:
                                if blast_record.alignments and blast_record.alignments[0].hsps:
                                    # blast hit found, set distance to e-value
                                    row = dist_matrix[i]
                                    row[j] = blast_record.alignments[0].hsps[0].expect
                                    dist_matrix[i] = row
                                    row = dist_matrix[j]
                                    row[i] = blast_record.alignments[0].hsps[0].expect
                                    dist_matrix[j] = row
                                else:
                                    # no blast hit found, set distance to default 10.0
                                    row = dist_matrix[i]
                                    row[j] = 10.0
                                    dist_matrix[i] = row
                                    row = dist_matrix[j]
                                    row[i] = 10.0
                                    dist_matrix[j] = row
                            blastn_xml.close()
                j += 1
        i += 1
        # update status
        percent = str(round(100 * len(already_compared) / float(len(seq_keys)), 2))
        sys.stdout.write('\r' + color.blue + 'Completed: ' + color.red + str(len(already_compared)) +
                         '/' + str(len(seq_keys)) + ' (' + percent + '%)' + color.done)
        sys.stdout.flush()
    # done looping through all keys, now clean up
    os.remove("blast" + process_num + ".xml")
    os.remove("query" + process_num + ".fasta")
    os.remove("subject" + process_num + ".fasta")
def get_raw_check(self, filename, format, alphabet, comp):
    # Also checking the key_function here
    if comp:
        h = gzip.open(filename, "rb")
        raw_file = h.read()
        h.close()
        h = gzip_open(filename, format)
        id_list = [rec.id.lower() for rec in SeqIO.parse(h, format, alphabet)]
        h.close()
    else:
        h = open(filename, "rb")
        raw_file = h.read()
        h.close()
        id_list = [rec.id.lower() for rec in SeqIO.parse(filename, format, alphabet)]

    if format in ["sff"]:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', BiopythonParserWarning)
            rec_dict = SeqIO.index(filename, format, alphabet,
                                   key_function=lambda x: x.lower())
            rec_dict_db = SeqIO.index_db(":memory:", filename, format, alphabet,
                                         key_function=lambda x: x.lower())
    else:
        rec_dict = SeqIO.index(filename, format, alphabet,
                               key_function=lambda x: x.lower())
        rec_dict_db = SeqIO.index_db(":memory:", filename, format, alphabet,
                                     key_function=lambda x: x.lower())

    self.assertEqual(set(id_list), set(rec_dict))
    self.assertEqual(set(id_list), set(rec_dict_db))
    self.assertEqual(len(id_list), len(rec_dict))
    for key in id_list:
        self.assertIn(key, rec_dict)
        self.assertEqual(key, rec_dict[key].id.lower())
        self.assertEqual(key, rec_dict.get(key).id.lower())
        raw = rec_dict.get_raw(key)
        self.assertTrue(isinstance(raw, bytes),
                        "Didn't get bytes from %s get_raw" % format)
        self.assertTrue(raw.strip())
        self.assertIn(raw, raw_file)
        raw_db = rec_dict_db.get_raw(key)
        # Via index using format-specific get_raw which scans the file,
        # via index_db in general using raw length found when indexing.
        self.assertEqual(raw, raw_db,
                         "index and index_db .get_raw() different for %s" % format)
        rec1 = rec_dict[key]
        # Following isn't very elegant, but it lets me test the
        # __getitem__ SFF code is working.
        if format in SeqIO._BinaryFormats:
            handle = BytesIO(raw)
        else:
            handle = StringIO(_bytes_to_string(raw))
        if format == "sff":
            rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                                                    rec_dict._proxy._flows_per_read,
                                                    rec_dict._proxy._flow_chars,
                                                    rec_dict._proxy._key_sequence,
                                                    rec_dict._proxy._alphabet,
                                                    trim=False)
        elif format == "sff-trim":
            rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                                                    rec_dict._proxy._flows_per_read,
                                                    rec_dict._proxy._flow_chars,
                                                    rec_dict._proxy._key_sequence,
                                                    rec_dict._proxy._alphabet,
                                                    trim=True)
        elif format == "uniprot-xml":
            self.assertTrue(raw.startswith(b"<entry "))
            self.assertTrue(raw.endswith(b"</entry>"))
            # Currently the __getitem__ method uses this
            # trick too, but we hope to fix that later
            raw = """<?xml version='1.0' encoding='UTF-8'?>
<uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd">
%s
</uniprot>
""" % _bytes_to_string(raw)
            handle = StringIO(raw)
            rec2 = SeqIO.read(handle, format, alphabet)
        else:
            rec2 = SeqIO.read(handle, format, alphabet)
        self.assertEqual(True, compare_record(rec1, rec2))
    rec_dict.close()
    del rec_dict
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Author: Kevin Lamkiewicz
# Email: [email protected]

from Bio import SeqIO

import time
import sys
import pickle
import glob
import re

files = glob.glob(f"{sys.argv[1]}/gbvrl*.seq")
gb_vrl = SeqIO.index_db(f"{sys.argv[1]}/gbvrl.idx", files, "genbank")

countryRegex = re.compile(r'country="([^"]+)"')
accessionDateRegex = re.compile(r'collection_date="([^"]+)"')

d_metaInformation = {}

for accession in gb_vrl:
    genbankEntry = gb_vrl.get_raw(accession).decode("utf-8")
    country = re.findall(countryRegex, genbankEntry)
    if country:
        country = country[0].replace('\n', ' ').replace(' ', '_')
    else:
        country = '--'
    accessionDate = re.findall(accessionDateRegex, genbankEntry)
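The loop above scans the raw GenBank text returned by get_raw with regular expressions. Since index_db hands back fully parsed SeqRecord objects on item access, the same fields can also be read from the source feature qualifiers; a sketch of that alternative, assuming the standard INSDC qualifier names "country" and "collection_date" (this is illustrative and not part of the original script):

# Illustrative alternative: read qualifiers from the parsed record instead of
# regex-matching the raw GenBank text.
for accession in gb_vrl:
    record = gb_vrl[accession]  # parsed SeqRecord, loaded lazily from disk
    country, collection_date = '--', '--'
    for feature in record.features:
        if feature.type == "source":
            country = feature.qualifiers.get("country", ['--'])[0]
            collection_date = feature.qualifiers.get("collection_date", ['--'])[0]
            break
    d_metaInformation[accession] = (country, collection_date)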