Example #1
def intersect_keys(keys, reffile, cache=False):
    """Extract SeqRecords from the index by matching keys."""
    # Build/load the index of reference sequences
    index = None
    if cache:
        refcache = reffile + '.sqlite'
        if os.path.exists(refcache):
            if os.stat(refcache).st_mtime < os.stat(reffile).st_mtime:
                logging.warn("Outdated cache; rebuilding index")
            else:
                try:
                    index = SeqIO.index_db(refcache)
                except Exception:
                    logging.warn("Skipping corrupted cache; rebuilding index")
                    index = None
    else:
        refcache = ':memory:'
    if index is None:
        # Rebuild the index, for whatever reason
        index = SeqIO.index_db(refcache, [reffile], 'fasta')

    # Extract records by key
    for key in keys:
        try:
            record = index[key]
        except LookupError:
            # Missing keys are rare, so it's faster not to check every time
            logging.info("No match: %s", repr(key))
            continue
        yield record
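For reference, a minimal usage sketch for this generator (the file and key names here are hypothetical):

    from Bio import SeqIO

    wanted = ["seqA", "seqB", "seqC"]  # hypothetical accessions
    matches = intersect_keys(wanted, "reference.fasta", cache=True)
    # The generator is lazy; SeqIO.write drains it and writes only the hits.
    SeqIO.write(matches, "matched.fasta", "fasta")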
Example #2
    def key_check(self, filename, format, alphabet, comp):
        """Check indexing with a key function."""
        if comp:
            h = gzip_open(filename, format)
            id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)]
            h.close()
        else:
            id_list = [rec.id for rec in SeqIO.parse(filename, format, alphabet)]

        key_list = [add_prefix(id) for id in id_list]

        with warnings.catch_warnings():
            if "_alt_index_" in filename:
                # BiopythonParserWarning: Could not parse the SFF index:
                # Unknown magic number b'.diy' in SFF index header:
                # b'.diy1.00'
                warnings.simplefilter("ignore", BiopythonParserWarning)

            rec_dict = SeqIO.index(filename, format, alphabet, add_prefix)
            self.check_dict_methods(rec_dict, key_list, id_list)
            rec_dict.close()
            del rec_dict

            if not sqlite3:
                return

            # In memory,
            rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet, add_prefix)
            self.check_dict_methods(rec_dict, key_list, id_list)
            # check error conditions
            self.assertRaises(ValueError, SeqIO.index_db, ":memory:", format="dummy", key_function=add_prefix)
            self.assertRaises(ValueError, SeqIO.index_db, ":memory:", filenames=["dummy"], key_function=add_prefix)
            rec_dict.close()
            del rec_dict

            # Saving to file...
            index_tmp = filename + ".key.idx"
            if os.path.isfile(index_tmp):
                os.remove(index_tmp)
            rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet, add_prefix)
            self.check_dict_methods(rec_dict, key_list, id_list)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy
            del rec_dict

            # Now reload it...
            rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet, add_prefix)
            self.check_dict_methods(rec_dict, key_list, id_list)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy
            del rec_dict

            # Now reload without passing filenames and format
            rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet, key_function=add_prefix)
            self.check_dict_methods(rec_dict, key_list, id_list)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy
            del rec_dict
            os.remove(index_tmp)
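These tests assume a module-level add_prefix key function; a plausible minimal version (an assumption, not necessarily the real test helper) is:

    def add_prefix(key):
        # Derive the index key from the record id by adding a fixed prefix.
        return "id_" + key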
Example #3
    def key_check(self, filename, format, alphabet, comp):
        """Check indexing with a key function."""
        if comp:
            h = gzip_open(filename, format)
            id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)]
            h.close()
        else:
            id_list = [rec.id for rec in SeqIO.parse(filename, format, alphabet)]

        key_list = [add_prefix(id) for id in id_list]
        rec_dict = SeqIO.index(filename, format, alphabet, add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        rec_dict.close()
        del rec_dict

        if not sqlite3:
            return

        #In memory,
        rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet,
                                  add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        #check error conditions
        self.assertRaises(ValueError, SeqIO.index_db,
                          ":memory:", format="dummy",
                          key_function=add_prefix)
        self.assertRaises(ValueError, SeqIO.index_db,
                          ":memory:", filenames=["dummy"],
                          key_function=add_prefix)
        rec_dict.close()
        del rec_dict

        #Saving to file...
        index_tmp = filename + ".key.idx"
        if os.path.isfile(index_tmp):
            os.remove(index_tmp)
        rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet,
                                  add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        rec_dict.close()
        rec_dict._con.close()  # hack for PyPy
        del rec_dict

        #Now reload it...
        rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet,
                                  add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        rec_dict.close()
        rec_dict._con.close()  # hack for PyPy
        del rec_dict

        #Now reload without passing filenames and format
        rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet,
                                  key_function=add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        rec_dict.close()
        rec_dict._con.close()  # hack for PyPy
        del rec_dict
        os.remove(index_tmp)
Example #4
    def simple_check(self, filename, format, alphabet):
        """Check indexing (without a key function)."""
        id_list = [rec.id for rec in SeqIO.parse(filename, format, alphabet)]

        rec_dict = SeqIO.index(filename, format, alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        rec_dict._proxy._handle.close() #TODO - Better solution
        del rec_dict

        if not sqlite3:
            return

        #In memory,
        #note here give filenames as list of strings
        rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        rec_dict.close()
        del rec_dict

        #check error conditions
        self.assertRaises(ValueError, SeqIO.index_db,
                          ":memory:", format="dummy")
        self.assertRaises(ValueError, SeqIO.index_db,
                          ":memory:", filenames=["dummy"])

        #Saving to file...
        index_tmp = filename + ".idx"
        if os.path.isfile(index_tmp):
            os.remove(index_tmp)

        #To disk,
        #note here we give the filename as a single string
        #to confirm that works too (convenience feature).
        rec_dict = SeqIO.index_db(index_tmp, filename, format, alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        rec_dict.close()
        rec_dict._con.close() #hack for PyPy
        del rec_dict

        #Now reload it...
        rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        rec_dict.close()
        rec_dict._con.close() #hack for PyPy
        del rec_dict

        #Now reload without passing filenames and format
        rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        rec_dict.close()
        rec_dict._con.close() #hack for PyPy
        del rec_dict
        os.remove(index_tmp)
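Distilled from the test above, the round-trip that index_db supports (file names hypothetical): the SQLite file records the source filenames and format, so reopening needs only the index path.

    from Bio import SeqIO

    d = SeqIO.index_db("example.idx", ["reads.fasta"], "fasta")  # build
    d.close()
    d = SeqIO.index_db("example.idx")  # reload; filenames/format come from the index
    print(len(d))
    d.close()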
Example #5
def makeSQLindex(infiles=None, data_inpath='', mode='grouped', outname=None):
    '''Create an SQLite index from uncompressed files or compressed .bgzf files.

    If infiles is a string, it is interpreted as a glob pattern.

    If infiles is a list, every file name in the list is indexed.

     - mode - grouped: all files are indexed into a single index file, specified by outname.
     - mode - separate: each file is indexed into its own .idx file.
    '''
    
    starting_dir = os.getcwd()
    
    if data_inpath:
        os.chdir(data_inpath)
        
    if outname is None:
        outname = 'reads.idx'
  
    if type(infiles) is str:
        # Fetch files by file types using glob
        import glob 
        infiles = glob.glob(infiles)
    elif type(infiles) is not list and type(infiles) is not tuple:
        raise Exception("Invalid input files specified.")

    assert infiles, 'No files found, or no files passed.'

    # Handle multiple types of input for infiles
    if mode == 'grouped':
        idx_filename = outname
        tak = time.time()
        print('Writing {0} files to SQL index ...'.format(len(infiles)))
        SeqIO.index_db(idx_filename, infiles, 'fastq')
        idx_t = time.time() - tak
        print('Finished Indexing to {0} after {1}\n'.format(idx_filename, time.strftime('%H:%M:%S', time.gmtime(idx_t))))

    elif mode == 'separate':
    
        for filename in infiles: 
            tak = time.time()
            print('Writing SQL index file for {0} ...'.format(filename))
            idx_filename = filename.split('.')[0] + '.idx'
            SeqIO.index_db(idx_filename, filename, 'fastq')
            print('{0} written successfully'.format(idx_filename))
            idx_t = time.time() - tak
            print('Finished Indexing after {0}\n'.format(time.strftime('%H:%M:%S', time.gmtime(idx_t))))

    if os.getcwd() != starting_dir:
        os.chdir(starting_dir) 
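A hedged usage sketch for the grouped mode (directory and file names hypothetical):

    makeSQLindex('*.fastq', data_inpath='/data/run1', mode='grouped',
                 outname='run1_reads.idx')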
Example #6
    def key_check(self, filename, format, alphabet):
        """Check indexing with a key function."""
        if format in SeqIO._BinaryFormats:
            mode = "rb"
        else:
            mode = "r"

        id_list = [rec.id for rec in \
                   SeqIO.parse(open(filename, mode), format, alphabet)]

        key_list = [add_prefix(id) for id in id_list]
        rec_dict = SeqIO.index(filename, format, alphabet, add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)

        if not sqlite3:
            return

        #In memory,
        rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet,
                                  add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        #check error conditions
        self.assertRaises(ValueError, SeqIO.index_db,
                          ":memory:", format="dummy",
                          key_function=add_prefix)
        self.assertRaises(ValueError, SeqIO.index_db,
                          ":memory:", filenames=["dummy"],
                          key_function=add_prefix)

        #Saving to file...
        index_tmp = filename + ".key.idx"
        rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet,
                                  add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        rec_dict.close()
        del rec_dict
        #Now reload it...
        rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet,
                                  add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        rec_dict.close()
        del rec_dict
        #Now reload without passing filenames and format
        rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet,
                                  key_function=add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        rec_dict.close()
        del rec_dict
        os.remove(index_tmp)
Example #7
 def load(self, files, dbname=None):
     if isinstance(files, str):
         files = [files]
     self.close()
     valid = []
     schemas = set()
     for filename in files:
         if not os.path.isfile(filename):
             print('No such file: %s' % filename)
             continue
         schema = SeqLoader.guess_schema(filename)
         if not schema:
             print('Unable to guess schema from filename: %s' % filename)
             continue
         schemas.add(schema)
         valid.append(filename)
     if len(schemas) != 1:
         raise ValueError('All files should be of the same type, but %d types found: %s' % (len(schemas), schemas))
     if not valid:
         print('No valid files provided.')
         return False
     if not dbname:
         self.dbname = mktmp_name('_SeqView.db')
         safe_unlink(self.dbname)
         self.tmp_db = True
     else:
         self.dbname = dbname
     self.db = SeqIO.index_db(self.dbname, valid, schemas.pop())
     self._ids = tuple(sorted(self.db.keys()))
     self.master = True
     return bool(self)
Example #8
def _unpickle_SeqView(dbname, ids, upper):
    v = SeqView(upper)
    v.dbname = dbname
    v.db = SeqIO.index_db(dbname)
    v.master = True
    v._ids = ids
    return v
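index_db objects hold an open SQLite connection and are not picklable, which is why this helper rebuilds the view from its database path. The matching hook on the SeqView class might look like this sketch (assuming the constructor argument is stored as self.upper):

    def __reduce__(self):
        # Pickle only the pieces needed to reopen the index later.
        return _unpickle_SeqView, (self.dbname, self._ids, self.upper)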
Example #9
 def reload(self, dbname):
     self.close()
     self.dbname = dbname
     self.db = SeqIO.index_db(self.dbname)
     self._ids = sorted(self.db.keys())
     self.tmp_db = False
     self.master = True
Example #10
    def extract_estseq(self):

        '''Function to extract and print the ESTSeq slice.'''

        self.logger.debug("Extracting the slice from ESTSeq.")
        estseqOut = os.path.join(
            self.dataFolder, "{0}.{1}.fa".format(self.new_name,
                                                 self.estseq_alias))

        if os.path.exists(estseqOut) and os.path.getsize(estseqOut)>0:
            self.logger.warn("ESTSeq file for alias {0} already present. Exiting".format(
                    self.estseq_alias))
            return
    
        index=SeqIO.index_db(self.estseq)
        if self.chrom not in index:
            self.logger.critical("Chromosome {0} not in ESTSeq. Exiting.".format(self.chrom))
            self.failed=True
            return
        start = max(self.start-self.flank-1, 0)
        end = min(len(index[self.chrom])-1, 
                  self.end-1+self.flank)
    
        seq=index[self.chrom][start:end]
        with open(estseqOut,'w') as out:
            seq.id=self.new_name
            seq.description=""
            print(seq.format('fasta'), file=out)

        self.logger.debug("Finished extracting the CONSeq sequence.")
        return
Example #11
 def test_old_files_same_dir(self):
     """Load existing index with correct files (from same directory)."""
     os.chdir("Roche")
     d = SeqIO.index_db("triple_sff.idx",
                        ["E3MFGYR02_no_manifest.sff", "greek.sff", "paired.sff"])
     self.assertEqual(54, len(d))
     self.assertEqual(395, len(d["alpha"]))
Example #12
def filter_clusters2(cluster_filepath, idx_filepath, size_range, output_dirpath):
    """ Writes a subset of cluster sizes to FastQ files 
    
    The representative sequence is the first sequence record written.
    
    make the sequence record instead of passing it.
    
    """
    
    starting_dir = os.getcwd()
    idx_dir = os.path.split(idx_filepath)[0]
    
    # Check and create directory 
    if not os.path.exists(output_dirpath):
        os.makedirs(output_dirpath)
        
    cluster_gen = parse(cluster_filepath, idx_filepath)
    seqrec_lookup = SeqIO.index_db(idx_filepath)
    
    size_counter = Counter()
    
    for cluster in cluster_gen:
        # Check if cluster size is within defined range
        if size_range[0] < cluster.size < size_range[1]:
            
            size_counter[cluster.size] += 1
            
            # Get the sequence records for the cluster 
            if os.getcwd() != idx_dir:
                os.chdir(idx_dir)
            # Representative sequence first 
            seqrecord = seqrec_lookup[cluster.rep_seq_id]
            
            if os.getcwd() != output_dirpath:
                os.chdir(output_dirpath)
            # Write cluster to a file 
            fname = "clustersize{0}-No{1}.fastq".format(str(cluster.size), str(size_counter[cluster.size]))
            
            # Truncate any existing file, then append records to it.
            if os.path.isfile(fname):
                open(fname, "wb").close()

            output_handle = open(fname, "a")
            SeqIO.write(seqrecord, output_handle, "fastq")
            
            for member in cluster.members_id:
                
                if os.getcwd() != idx_dir:
                    os.chdir(idx_dir)
                # Then each member sequence
                seqrecord = seqrec_lookup[member]
            
                if os.getcwd() != output_dirpath:
                    os.chdir(output_dirpath)
                # Write sequence record to file 
                SeqIO.write(seqrecord, output_handle, "fastq")
                
    if os.getcwd() != starting_dir: 
        os.chdir(starting_dir)
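The os.chdir juggling above is fragile (any exception leaves the process in a different working directory); a hedged sketch of the same inner write using absolute paths instead, reusing the names defined in the example:

    fname = os.path.join(output_dirpath,
                         "clustersize{0}-No{1}.fastq".format(cluster.size,
                                                             size_counter[cluster.size]))
    with open(fname, "w") as output_handle:
        # Representative sequence first, then the members.
        SeqIO.write(seqrec_lookup[cluster.rep_seq_id], output_handle, "fastq")
        SeqIO.write((seqrec_lookup[m] for m in cluster.members_id),
                    output_handle, "fastq")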
Example #13
 def test_order_index_db(self):
     """Check index_db preserves order in multiple indexed files."""
     files = ["GenBank/NC_000932.faa", "GenBank/NC_005816.faa"]
     ids = []
     for f in files:
         ids.extend(r.id for r in SeqIO.parse(f, "fasta"))
     d = SeqIO.index_db(":memory:", files, "fasta")
     self.assertEqual(ids, list(d))
Example #14
    def simple_check(self, filename, format, alphabet):
        """Check indexing (without a key function)."""
        if format in SeqIO._BinaryFormats:
            mode = "rb"
        else:
            mode = "r"

        id_list = [rec.id for rec in \
                   SeqIO.parse(open(filename, mode), format, alphabet)]

        rec_dict = SeqIO.index(filename, format, alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)

        if not sqlite3:
            return

        #In memory,
        #note here give filenames as list of strings
        rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        #check error conditions
        self.assertRaises(ValueError, SeqIO.index_db,
                          ":memory:", format="dummy")
        self.assertRaises(ValueError, SeqIO.index_db,
                          ":memory:", filenames=["dummy"])

        #Saving to file...
        index_tmp = filename + ".idx"
        #To disk,
        #note here we give the filename as a single string
        #to confirm that works too (convenience feature).
        rec_dict = SeqIO.index_db(index_tmp, filename, format, alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        rec_dict.close()
        del rec_dict
        #Now reload it...
        rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        rec_dict.close()
        del rec_dict
        #Now reload without passing filenames and format
        rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        rec_dict.close()
        del rec_dict
        os.remove(index_tmp)
Example #15
def indexer(index_name, fasta_name):

    '''Set up an index to search a large fasta file. We are going to try to use the SeqIO.index_db
    because it creates (and subsequently loads) a SQLite database on disk instead of holding a
    giant 2GB index in memory.'''

    # index_db's second argument is the file (or files) to index; the format
    # string must come third, so the original call passing only 'fasta' would
    # have treated 'fasta' itself as a filename.
    index = SeqIO.index_db(index_name, fasta_name, 'fasta')
    return index
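Usage, with hypothetical paths and keys:

    index = indexer('proteins.idx', 'proteins.fasta')
    record = index['some_id']  # hypothetical record id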
Example #16
def intersect_keys(keys, reffile, cache=False, clean_accs=False):
    """Extract SeqRecords from the index by matching keys.

    keys - an iterable of sequence identifiers/accessions to select
    reffile - name of a FASTA file to extract the specified sequences from
    cache - save an index of the reference FASTA sequence offsets to disk?
    clean_accs - strip HMMer extensions from sequence accessions?
    """
    # Build/load the index of reference sequences
    index = None
    if cache:
        refcache = reffile + '.sqlite'
        if os.path.exists(refcache):
            if os.stat(refcache).st_mtime < os.stat(reffile).st_mtime:
                logging.warn("Outdated cache; rebuilding index")
            else:
                try:
                    index = (SeqIO.index_db(refcache,
                                            key_function=clean_accession)
                             if clean_accs
                             else SeqIO.index_db(refcache))

                except Exception:
                    logging.warn("Skipping corrupted cache; rebuilding index")
                    index = None
    else:
        refcache = ':memory:'
    if index is None:
        # Rebuild the index, for whatever reason
        index = (SeqIO.index_db(refcache, [reffile], 'fasta',
                                key_function=clean_accession)
                 if clean_accs
                 else SeqIO.index_db(refcache, [reffile], 'fasta'))

    # Extract records by key
    if clean_accs:
        keys = (clean_accession(k) for k in keys)
    for key in keys:
        try:
            record = index[key]
        except LookupError:
            # Missing keys are rare, so it's faster not to check every time
            logging.info("No match: %s", repr(key))
            continue
        yield record
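clean_accession is assumed by this example and not shown; given the docstring ("strip HMMer extensions from sequence accessions"), a plausible minimal version (an assumption) drops the '/start-end' suffix HMMer appends to domain hits:

    def clean_accession(key):
        # 'ABC123/4-56' -> 'ABC123'; plain accessions pass through unchanged.
        return key.split('/')[0]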
Example #17
 def check(self, index_file, sff_files):
     if os.path.isfile(index_file):
         os.remove(index_file)
     # Build index...
     d = SeqIO.index_db(index_file, sff_files, "sff")
     self.assertEqual(395, len(d["alpha"]))
     d._con.close()  # hack for PyPy
     d.close()
     self.assertEqual([os.path.abspath(f) for f in sff_files],
                      [os.path.abspath(f) for f in d._filenames])
     # Load index...
     d = SeqIO.index_db(index_file, sff_files)
     self.assertEqual(395, len(d["alpha"]))
     d._con.close()  # hack for PyPy
     d.close()
     self.assertEqual([os.path.abspath(f) for f in sff_files],
                      [os.path.abspath(f) for f in d._filenames])
     os.remove(index_file)
Example #18
 def _init_storage(self, genome):
     """Load sequences from genome, their sizes and init links"""
     # load fasta into index
     self.sequences = SeqIO.index_db(genome+".db3", genome, 'fasta')
     self.seq = self.sequences
     # prepare storage
      self.contigs = {c: len(self.seq[c]) for c in self.seq} # very inefficient: this parses every record just to get its length!
     self.links   = {c: [{}, {}] for c in self.contigs}
     self.ilinks  = 0
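As the inline comment notes, getting contig lengths through index_db parses every record; one hedged alternative, assuming the genome FASTA has a samtools-style .fai index alongside it, reads the lengths directly:

    def fai_lengths(fasta_path):
        # .fai columns: name, length, byte offset, bases per line, bytes per line
        lengths = {}
        with open(fasta_path + '.fai') as handle:
            for line in handle:
                name, length = line.split('\t')[:2]
                lengths[name] = int(length)
        return lengths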
Example #19
 def find_and_open(self, input_directory):
     filename = self.get_filename(input_directory)
     if not os.path.isfile(filename):
         return False
     else:
         self.filename = filename
         self.index_filename = filename.replace('.fasta', '.idx')
         self.index = SeqIO.index_db(self.index_filename, self.filename, 'fasta', generic_protein)
         self._open = True
         return True
Example #20
def main(args, loglevel):

    logging.basicConfig(format="%(levelname)s: %(message)s", level=loglevel)

    if args.seqfile.endswith('.gz'):
        fh = gzip.open(args.seqfile, 'rt')
    else:
        fh = open(args.seqfile, 'r')

    for line in fh:
        if line[0] == '>':
            filetype = 'fasta'
            break
        elif line[0] == '@':
            filetype = 'fastq'
            break
        else:
            raise RuntimeError("Cannot guess file type for %s" % args.seqfile)
    else:
        # for/else: the loop body never ran, i.e. the input file was empty,
        # so filetype would otherwise be unbound below.
        raise RuntimeError("Empty sequence file %s" % args.seqfile)
    fh.close()

    logging.info("Indexing {} file {}".format(filetype, args.seqfile))

    record_dict = SeqIO.index_db(args.seqfile + '.idx', args.seqfile, filetype)

    logging.info("Reading filter file {}".format(args.filterfile))
    with open(args.filterfile, 'r') as fh:
        id_count = sum(1 for line in fh)
        fh.seek(0)
        start_time = time.time()
        try:
            for i, line in enumerate(fh):
                if i % 1000 == 0:
                    try:
                        rate = i / (time.time()-start_time)
                        time_remain = (id_count - i) / rate
                    except ZeroDivisionError:
                        rate = 0
                        time_remain = 0
                    logging.info("Processed {} of {} IDs ({:.2f} per second, est. complete at {})".
                                 format(i, id_count, rate, time.asctime(time.localtime(time.time() + time_remain))))
                try:
                    rec = record_dict[line.rstrip()]
                    print(rec.format(filetype))
                except KeyError:
                    logging.debug('record id {} not found'.format(line.strip()))
                    pass
        except IOError:
            try:
                sys.stdout.close()
            except IOError:
                pass
            try:
                sys.stderr.close()
            except IOError:
                pass
Example #21
    def __init__(self, guide_seq, all_seq_keys, length_threshold, evalue_threshold, gb_dir, num_cores):
        """
        Input: name of FASTA file containing guide sequences, dictionary of all GenBank sequences,
        a list of ingroup/outgroup sequences, the e-value threshold to cluster, and the
        threshold of sequence length percent similarity to cluster taxa,
        and the GenBank directory.
        Generates a list of clusters (each cluster is itself a list of keys to sequences).
        """
        ClusterBuilder.__init__(self, all_seq_keys)
        
        lock = multiprocessing.Lock()
        manager = multiprocessing.Manager()
        already_compared = manager.list()
        clusters = manager.list()

        color = Color()
        # check for fasta file of guide sequences
        if not os.path.isfile(guide_seq):
            print(color.red + "FASTA file of guide sequences not found. Please re-try." + color.done)
            sys.exit(0)
        else:
            # initialize an empty list for each cluster
            guide_sequences = SeqIO.parse(guide_seq, "fasta")
            for guide in guide_sequences:
                clusters.append([])

        # make blast database
        gb = SeqIO.index_db(gb_dir + "/gb.idx")
        output_handle = open('blast_db.fasta', 'w')
        records = []
        for key in all_seq_keys:
            record = gb[key]
            records.append(record)
        SeqIO.write(records, output_handle, 'fasta')
        output_handle.close()

        # spawn processes
        print(color.blue + "Spawning " + color.red + str(num_cores) + color.blue + " processes to make clusters." + color.done)
        processes = []
        
        for i in range(num_cores):
            p = multiprocessing.Process(target=self.make_guided_clusters_worker, args=(guide_seq, all_seq_keys, \
                length_threshold, evalue_threshold, clusters, already_compared, lock, i, gb_dir))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
        
        sys.stdout.write("\n")
        sys.stdout.flush()
        self.clusters = clusters
        if os.path.isfile("blast_db.fasta"):
            os.remove("blast_db.fasta")
Example #22
File: genbank.py Project: wf8/sumac
 def sqlite(path):
     """
     Sets up the SQLite db for the GenBank division.
     Path is the absolute path of the GB files.
     Returns a dictionary of SeqRecord objects.
     """
     color = Color()
     if os.path.exists(path + "/gb.idx"):
         print(color.purple + "Genbank database already downloaded. Indexing sequences..." + color.done)
         return SeqIO.index_db(path + "/gb.idx")
     else:
         files = os.listdir(path)
         path_files = []
         if len(files) == 0:
             print(color.red + "GenBank files not found. Re-download with the -d option. See --help for more details." + color.done)
             sys.exit(0)
         for file in files:
             path_files.append(path + "/" + file)
         print(color.purple + "Genbank database already downloaded. Indexing sequences..." + color.done)
         return SeqIO.index_db(path + "/gb.idx", path_files, "genbank")
Example #23
def fasta2homozygous(out, fasta, identity, overlap, minLength, \
                     libraries, limit, \
                     threads=1, joinOverlap=200, endTrimming=0, verbose=0):
    """Parse alignments and report homozygous contigs"""
    #create/load fasta index
    if verbose:
        sys.stderr.write("Indexing fasta...\n")
    faidx = SeqIO.index_db(fasta.name+".db3", fasta.name, "fasta")
    genomeSize = sum(len(faidx[c]) for c in faidx) 

    # depth-of-coverage info
    c2cov, covTh = None, None
    if libraries:
        c2cov, covTh = get_coverage(faidx, fasta.name, libraries, limit, \
                                    verbose)
    # run blat for identity >= 0.85
    similarity, name = blat, "BLAT"
    # or run last for more diverged haplotypes
    if identity < 0.85:
        similarity, name = last_single, "LAST"
        # multi-threading on python 2.7+ only, as 2.6 stalls
        if threads > 1 and sys.version_info[0] == 2 \
           and sys.version_info[1] > 6:
            similarity, name = last_multi, "multithreaded LAST"
    #run blat
    psl = fasta.name + ".psl.gz"
    if not os.path.isfile(psl):
        if verbose:
            sys.stderr.write("Running %s...\n"%name)
        similarity(fasta.name, identity, threads, verbose)
    
    if verbose:
        sys.stderr.write("Parsing alignments...\n")
    #filter alignments
    hits, overlapping = psl2hits(psl, identity, overlap, joinOverlap, endTrimming)

    #remove redundant
    ## maybe store info about removed also
    contig2skip, identity = hits2skip(hits, faidx, verbose)
    #print "\n".join("\t".join(map(str, x)) for x in overlapping[:100]); return
    
    #report homozygous fasta
    nsize, k, skipped, ssize, merged = merge_fasta(out, faidx, contig2skip, \
                                                   overlapping, minLength, verbose)
    
    #summary    
    info = "%s\t%s\t%s\t%s\t%.2f\t%s\t%.2f\t%.3f\t%s\t%s\t%.2f\t%s\t%.2f\n"
    sys.stderr.write(info%(fasta.name, genomeSize, len(faidx), ssize, 100.0*ssize/genomeSize, \
                           skipped, 100.0*skipped/len(faidx), identity, len(merged), \
                           nsize, 100.0*nsize/genomeSize, k, 100.0*k/len(faidx)))

    return genomeSize, len(faidx), ssize, skipped, identity
Example #24
def make_contigs_and_split_hints(input_filename, output_filenames, genome_filename):
    prefix = input_filename.replace('.sorted.hints', '')
    genome_filename = genome_filename.replace(prefix, '')
    index_filename = re.sub(FASTA_RE,'.idx', genome_filename)
    hints_file = open(input_filename)
    contig_list_filename = output_filenames[0].replace(prefix, '')
    contig_list_output_file = open(contig_list_filename,'w')
    genome_dict = SeqIO.index_db(index_filename)
    current_contig = None
    contigs_seen = set()
    for line in hints_file:
        if line.startswith('#'):
            continue
        fields = line.split('\t')
        assert len(fields) == 9, 'invalid hints format, expected 9 fields, got this line: {}'.format(line)
        contig_name = fields[0]
        if contig_name != current_contig:
            contigs_seen.add(contig_name)
            if contig_name.startswith('@unitig'):
                match = unitig_re.match(contig_name)
                output_prefix = 'unitig_' + match.group(1)
            else:
                output_prefix = contig_name
            hints_output_file = open(output_prefix + HINTS_SUFFIX,'w')
            contig_output_file = open(output_prefix + CONTIG_SUFFIX, 'w')
            contig_seq = genome_dict[contig_name]
            SeqIO.write(contig_seq, contig_output_file, 'fasta')
            contig_output_file.close()
            contig_list_output_file.write('\t'.join([contig_name, output_prefix + HINTS_SUFFIX, output_prefix + CONTIG_SUFFIX]) + '\n')
            current_contig = contig_name
        hints_output_file.write(line)
    hints_output_file.close()
    # write out all the contigs for which we have no hints, 
    # along with blank hints files
    for contig_name in genome_dict.keys():
        if not contig_name in contigs_seen:
            #TODO: make this into a function, we're re-using code here
            if contig_name.startswith('@unitig'):
                match = unitig_re.match(contig_name)
                output_prefix = 'unitig_' + match.group(1)
            else:
                output_prefix = contig_name
            hints_output_file = open(output_prefix + HINTS_SUFFIX,'w')
            hints_output_file.close() # write a blank hints file
            contig_output_file = open(output_prefix + CONTIG_SUFFIX, 'w')
            contig_seq = genome_dict[contig_name]
            SeqIO.write(contig_seq, contig_output_file, 'fasta')
            contig_output_file.close()
            contig_list_output_file.write('\t'.join([contig_name, output_prefix + HINTS_SUFFIX, output_prefix + CONTIG_SUFFIX]) + '\n')            
    contig_list_output_file.close()
Example #25
def fastq2random(outbase, files, n, verbose, seqformat='fastq'):
    """Return number of random reads from FastQ file(s)"""
    #generate indexes
    if verbose:
        sys.stderr.write("Generating indexes...\n")
    indexes = [] 
    for i, f in enumerate(files, 1):
        sys.stderr.write(" %s \r"%i)
        c = Counter()
        index = SeqIO.index_db(f.name+'.idx', f.name, seqformat, key_function=c.count)
        index.close()

    #get random entries
    store_random_entries(outbase, files, n, verbose)
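The Counter passed as key_function here is evidently a project-specific helper (collections.Counter has no count method); a plausible sketch that keys each record by its ordinal position in the file:

    class Counter:
        '''Number records as index_db visits them.'''
        def __init__(self):
            self.i = 0

        def count(self, key):
            # Ignore the record id; return a running ordinal as the key.
            self.i += 1
            return str(self.i)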
Example #26
        def check(self, index_file, sff_files, expt_sff_files):
            if os.path.isfile(index_file):
                os.remove(index_file)
            # Build index...
            d = SeqIO.index_db(index_file, sff_files, "sff")
            self.assertEqual(395, len(d["alpha"]))
            d._con.close()  # hack for PyPy
            d.close()
            self.assertEqual([os.path.abspath(f) for f in sff_files], [os.path.abspath(f) for f in d._filenames])

            # Now directly check the filenames inside the SQLite index:
            filenames, flag = raw_filenames(index_file)
            self.assertEqual(flag, True)
            self.assertEqual(filenames, expt_sff_files)

            # Load index...
            d = SeqIO.index_db(index_file, sff_files)
            self.assertEqual(395, len(d["alpha"]))
            d._con.close()  # hack for PyPy
            d.close()
            self.assertEqual([os.path.abspath(f) for f in sff_files], d._filenames)

            os.remove(index_file)
Example #27
def get_seq(args, seqname, start=1, end=None, strand=1):
  """builds a biopython database file for the subject sequences and retrieves the specified
  portion of the specified sequence, reverse-complementing if necessary and translating
  dna sequences."""
  sequence_db = SeqIO.index_db(args.index_filename, args.seqfilename, 'fasta')
  seq = sequence_db[seqname][start-1:end].seq
  if strand < 0:
    seq = seq.reverse_complement()
  if args.translate and args.program == 'tblastn':
    while len(seq) % 3:
      seq = seq[:-1]
    seq = seq.translate()
  return seq  
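A hedged usage sketch; the argument object and file names are hypothetical stand-ins for the script's parsed CLI options:

    import argparse

    args = argparse.Namespace(index_filename='subjects.idx',
                              seqfilename='subjects.fasta',
                              translate=True, program='tblastn')
    fragment = get_seq(args, 'contig_42', start=101, end=400, strand=-1)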
Example #28
def main(argv):
  blastfilename = ''
  seqfilename = ''
  try:
      opts, args = getopt.getopt(argv,"hb:s:",["seqfile=","blastfile="])
  except getopt.GetoptError:
    print('Type Blast2OrthologGroups.py -h for options')
    sys.exit(2)
  for opt, arg in opts:
    if opt == "-h":
       print('Blast2OrthologGroups.py -b <blastfile> -s <seqfile>')
       sys.exit()
    elif opt in ("-b", "--blastfile"):
       blastfilename = arg
    elif opt in ("-s", "--seqfile"):
       seqfilename = arg
  #Exclude sequences that cluster with chloroplast-encoded genes     
  chloroplast_clusters = ("Cluster_10175", "Cluster_10226", "Cluster_10232", "Cluster_10485", "Cluster_10984", "Cluster_11190", "Cluster_11826", "Cluster_11977", "Cluster_11980", "Cluster_12324", "Cluster_12326", "Cluster_12658", "Cluster_12697", "Cluster_1272", "Cluster_12984", "Cluster_13661", "Cluster_13666", "Cluster_13836", "Cluster_14312", "Cluster_14375", "Cluster_14565", "Cluster_1826", "Cluster_2867", "Cluster_29556", "Cluster_4023", "Cluster_4421", "Cluster_5258", "Cluster_5739", "Cluster_5972", "Cluster_615", "Cluster_7013", "Cluster_7860", "Cluster_8325", "Cluster_8384")     
  cluster_info = path.join(path.expanduser("~"), "Bioinformatics", "Selaginella", "RefSeq", "SeqClusters.p")
  seq_groups = pickle.load( open( cluster_info, "rb" ) )
  indexfilename = seqfilename + ".inx"
  new_seqs = SeqIO.index_db(indexfilename, seqfilename, "fasta")
  used_seqs = {}
  with open(blastfilename) as f:
    reader=csv.reader(f,delimiter='\t')
    for row in reader:
      qseqid, qlen, sacc, slen, pident, length, mismatch, gapopen, qstart, qend, qframe, sstart, send, sframe, evalue, bitscore = row
      if sacc in seq_groups and qseqid not in used_seqs and seq_groups[sacc] not in chloroplast_clusters:
        used_seqs[qseqid] = 1
        seq = new_seqs[qseqid]
        #cluster_filename = path.join(path.expanduser("~"), "Bioinformatics", "Selaginella", "RefSeq", seq_groups[sacc] + ".fa")      
        cluster_filename = path.join(path.expanduser("~"), "Bioinformatics", "Selaginella", "ContigClusters", seq_groups[sacc] + ".fa")     
        cluster_file = open(cluster_filename, "a")
        #print "saving %s to %s" % (qseqid, cluster_filename)
        id = seq.id
        if int(send) < int(sstart):
          seq = seq.reverse_complement()
          id = id + '_rc'
        seq.id, seq.description = id, id
        cluster_file.write(seq.format("fasta"))
        cluster_file.close()
Example #29
    def extract_conseq(self):
        '''Method to extract the CONSeq slice from a CONSeq genome file.'''


        self.logger.debug("Extracting the slice from CONSeq.")
        conseqOut = os.path.join(
            self.dataFolder, "{0}.{1}.conseq".format(self.new_name,
                                                     self.conseq_alias))
        if os.path.exists(conseqOut) and os.path.getsize(conseqOut)>0:
            self.logger.warn("CONSeq file for alias {0} already present. Exiting".format(self.conseq_alias))
            return

        try:
            index = SeqIO.index_db(self.conseq)
        except Exception as error:
            self.logger.exception(error)
            self.logger.error("Conseq is: {0}".format(self.conseq))
            self.failed=True
            return


        if self.chrom not in index:
            self.logger.critical("Chromosome {0} not in Conseq file. Exiting.".format(self.chrom))
            self.failed=True
            return

        #Establish boundaries
        start = max(self.start-self.flank-1, 0)
        end = min(len(index[self.chrom])-1, 
                  self.end-1+self.flank)
    
        seq=index[self.chrom][start:end]
        with open(conseqOut,'w') as out:
            print(seq.seq, file=out)

        self.logger.debug("Finished extracting the CONSeq sequence.")
        return
Example #30
    def distance_matrix_worker(self, seq_keys, length_threshold, dist_matrix, already_compared, lock, process_num, gb_dir):
        """
        Worker process for make_distance_matrix(). Takes a list "already_compared" of sequences that have
        already had all pairwise comparisons. Each worker process will work making pairwise comparisons
        for a different sequence, adding them to the "already_compared" list as they are completed.
        """
        # each process must load its own sqlite gb
        gb = SeqIO.index_db(gb_dir + "/gb.idx")
        process_num = str(process_num)
        i = 0
        color = Color()
        for key in seq_keys:
            # check whether another process is already comparing this row
            compare_row = False
            with lock:
                if key not in already_compared:
                    already_compared.append(key)
                    compare_row = True
            if compare_row:
                
                # make the blast query
                record1 = gb[key]
                output_handle = open('query' + process_num + '.fasta', 'w')
                SeqIO.write(record1, output_handle, 'fasta')
                output_handle.close()
                
                # make blast database
                j = 0
                output_handle = open('blast_db' + process_num + '.fasta', 'w')
                records = []
                for key2 in seq_keys:
                    # only add sequences that have not yet been compared
                    if j > i:
                        record = gb[key2]
                        records.append(record)
                    if j == i:
                        row = dist_matrix[i]
                        row[j] = 0.0
                        dist_matrix[i] = row
                    j += 1
                SeqIO.write(records, output_handle, 'fasta')
                output_handle.close()

                if len(records) > 0:
                    # blast query against blast_db
                    blastn_cmd = NcbiblastnCommandline(query='query' + process_num + '.fasta', subject='blast_db' + process_num + '.fasta', \
                        out='blast' + process_num + '.xml', outfmt=5)
                    stdout, stderr = blastn_cmd()

                    # parse blast output
                    j = i + 1
                    blastn_xml = open('blast' + process_num + '.xml', 'r')
                    blast_records = NCBIXML.parse(blastn_xml)
                    for blast_record in blast_records:
                        for alignment in blast_record.alignments:
                            # loop through each high-scoring segment pair (HSP)
                            for hsp in alignment.hsps:
                                length1 = len(record1.seq)
                                length2 = alignment.length
                                # first check if length similarity threshold met
                                if (length1 < length2 * (1 + float(length_threshold))) and (length1 > length2 * (1 - float(length_threshold))):
                                    # blast hit found, set distance to e-value
                                    row = dist_matrix[i]
                                    row[j] = hsp.expect
                                    dist_matrix[i] = row
                                    row = dist_matrix[j]
                                    row[i] = hsp.expect
                                    dist_matrix[j] = row
                                else:
                                    # set distance to 50.0 if length similarity threshold not met
                                    row = dist_matrix[i]
                                    row[j] = 50.0
                                    dist_matrix[i] = row
                                    row = dist_matrix[j]
                                    row[i] = 50.0
                                    dist_matrix[j] = row
                        j += 1
                    blastn_xml.close()
            i += 1
            # update status
            percent = str(round(100 * len(already_compared)/float(len(seq_keys)), 2))
            sys.stdout.write('\r' + color.blue + 'Completed: ' + color.red + str(len(already_compared)) + '/' + str(len(seq_keys)) + ' (' + percent + '%)' + color.done)
            sys.stdout.flush()
        # done looping through all keys, now clean up
        if os.path.exists('blast_db' + process_num + '.fasta'):
            os.remove('blast_db' + process_num + '.fasta')
        if os.path.exists("blast" + process_num + ".xml"):
            os.remove("blast" + process_num + ".xml")
        if os.path.exists("query" + process_num + ".fasta"):
            os.remove("query" + process_num + ".fasta")
        if os.path.exists("subject" + process_num + ".fasta"):
            os.remove("subject" + process_num + ".fasta")
Example #31
    def extract_sequences_by_clusters(self,
                                      dir_with_cluster_files,
                                      dir_with_sequence_files,
                                      output_dir,
                                      file_with_white_list_cluster_ids=None,
                                      mode="families",
                                      sequence_file_extension="fasta",
                                      sequence_file_format="fasta",
                                      label_species=False,
                                      separator_for_labeling="@",
                                      species_label_first=True):
        """
        basenames of cluster and sequence files must be same

        mode:
            clusters - extract sequences from clusters in separate files,
            species - extract sequences from species to separate files
        """
        white_list_ids = None
        if file_with_white_list_cluster_ids:
            white_list_ids = IdSet()
            white_list_ids.read(file_with_white_list_cluster_ids)

        clusters_dict = self.read_cluster_files_from_dir(
            dir_with_cluster_files)
        cluster_names = self.get_cluster_names(clusters_dict,
                                               white_list_ids=white_list_ids)

        sequence_super_dict = OrderedDict()
        out_dir = self.check_path(output_dir)

        for species in clusters_dict:
            idx_file = "%s_tmp.idx" % species
            sequence_file = "%s%s.%s" % (self.check_path(
                dir_with_sequence_files), species, sequence_file_extension)
            sequence_super_dict[species] = SeqIO.index_db(
                idx_file, sequence_file, format=sequence_file_format)

        if mode == "species":
            sequence_names = self.get_sequence_names(
                clusters_dict,
                write_ids=False,
                out_prefix=None,
                white_list_ids=white_list_ids)
            for species in sequence_names:
                out_file = "%s%s.%s" % (out_dir, species,
                                        sequence_file_extension)
                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    sequence_super_dict[species], sequence_names[species]),
                            out_file,
                            format=sequence_file_format)
        elif mode == "families":

            def per_family_record_generator(seq_super_dict, clust_dict,
                                            cluster_id):
                if species_label_first:
                    label_sequence = lambda label, name: "%s%s%s" % (
                        label, separator_for_labeling, name)
                else:
                    label_sequence = lambda label, name: "%s%s%s" % (
                        name, separator_for_labeling, label)

                for species in seq_super_dict:
                    #print species, cluster_id
                    for record_id in clust_dict[species][cluster_id]:
                        if label_species:
                            record = deepcopy(
                                seq_super_dict[species][record_id])
                            record.id = label_sequence(species, record_id)
                            yield record
                        else:
                            yield seq_super_dict[species][record_id]

            for cluster_name in cluster_names:
                out_file = "%s%s.%s" % (out_dir, cluster_name,
                                        sequence_file_extension)
                SeqIO.write(per_family_record_generator(
                    sequence_super_dict, clusters_dict, cluster_name),
                            out_file,
                            format=sequence_file_format)

        for species in clusters_dict:
            os.remove("%s_tmp.idx" % species)
Example #32
 def test_old_same_dir(self):
     """Load existing index with no options (from same directory)."""
     os.chdir("Roche")
     d = SeqIO.index_db("triple_sff.idx")
     self.assertEqual(54, len(d))
     self.assertEqual(395, len(d["alpha"]))
Example #33
    def key_check(self, filename, format, alphabet, comp):
        """Check indexing with a key function."""
        if comp:
            h = gzip_open(filename, format)
            id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)]
            h.close()
        else:
            id_list = [
                rec.id for rec in SeqIO.parse(filename, format, alphabet)
            ]

        key_list = [add_prefix(id) for id in id_list]
        rec_dict = SeqIO.index(filename, format, alphabet, add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        rec_dict.close()
        del rec_dict

        if not sqlite3:
            return

        #In memory,
        rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet,
                                  add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        #check error conditions
        self.assertRaises(ValueError,
                          SeqIO.index_db,
                          ":memory:",
                          format="dummy",
                          key_function=add_prefix)
        self.assertRaises(ValueError,
                          SeqIO.index_db,
                          ":memory:",
                          filenames=["dummy"],
                          key_function=add_prefix)
        rec_dict.close()
        del rec_dict

        #Saving to file...
        index_tmp = filename + ".key.idx"
        if os.path.isfile(index_tmp):
            os.remove(index_tmp)
        rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet,
                                  add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        rec_dict.close()
        rec_dict._con.close()  # hack for PyPy
        del rec_dict

        #Now reload it...
        rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet,
                                  add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        rec_dict.close()
        rec_dict._con.close()  # hack for PyPy
        del rec_dict

        #Now reload without passing filenames and format
        rec_dict = SeqIO.index_db(index_tmp,
                                  alphabet=alphabet,
                                  key_function=add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
        rec_dict.close()
        rec_dict._con.close()  # hack for PyPy
        del rec_dict
        os.remove(index_tmp)
Example #34
    def simple_check(self, filename, format, alphabet, comp):
        """Check indexing (without a key function)."""
        if comp:
            h = gzip_open(filename, format)
            id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)]
            h.close()
        else:
            id_list = [
                rec.id for rec in SeqIO.parse(filename, format, alphabet)
            ]

        with warnings.catch_warnings():
            if "_alt_index_" in filename:
                # BiopythonParserWarning: Could not parse the SFF index:
                # Unknown magic number b'.diy' in SFF index header:
                # b'.diy1.00'
                warnings.simplefilter('ignore', BiopythonParserWarning)

            rec_dict = SeqIO.index(filename, format, alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list)
            rec_dict.close()
            del rec_dict

            if not sqlite3:
                return

            # In memory,
            # note here give filenames as list of strings
            rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list)
            rec_dict.close()
            del rec_dict

            # check error conditions
            self.assertRaises(ValueError,
                              SeqIO.index_db,
                              ":memory:",
                              format="dummy")
            self.assertRaises(ValueError,
                              SeqIO.index_db,
                              ":memory:",
                              filenames=["dummy"])

            # Saving to file...
            index_tmp = self.index_tmp
            if os.path.isfile(index_tmp):
                os.remove(index_tmp)

            # To disk,
            # note here we give the filename as a single string
            # to confirm that works too (convenience feature).
            rec_dict = SeqIO.index_db(index_tmp, filename, format, alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy
            del rec_dict

            # Now reload it...
            rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy
            del rec_dict

            # Now reload without passing filenames and format
            # and switch directory to check the paths still work
            index_tmp = os.path.abspath(index_tmp)
            os.chdir(os.path.dirname(filename))
            rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy
            del rec_dict

            os.remove(index_tmp)
Example #35
    def get_raw_check(self, filename, format, alphabet, comp):
        # Also checking the key_function here
        if comp:
            h = gzip.open(filename, "rb")
            raw_file = h.read()
            h.close()
            h = gzip_open(filename, format)
            id_list = [
                rec.id.lower() for rec in SeqIO.parse(h, format, alphabet)
            ]
            h.close()
        else:
            h = open(filename, "rb")
            raw_file = h.read()
            h.close()
            id_list = [
                rec.id.lower()
                for rec in SeqIO.parse(filename, format, alphabet)
            ]

        if format in ["sff"]:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', BiopythonParserWarning)
                rec_dict = SeqIO.index(filename,
                                       format,
                                       alphabet,
                                       key_function=lambda x: x.lower())
                if sqlite3:
                    rec_dict_db = SeqIO.index_db(
                        ":memory:",
                        filename,
                        format,
                        alphabet,
                        key_function=lambda x: x.lower())
        else:
            rec_dict = SeqIO.index(filename,
                                   format,
                                   alphabet,
                                   key_function=lambda x: x.lower())
            if sqlite3:
                rec_dict_db = SeqIO.index_db(":memory:",
                                             filename,
                                             format,
                                             alphabet,
                                             key_function=lambda x: x.lower())

        self.assertEqual(set(id_list), set(rec_dict))
        if sqlite3:
            self.assertEqual(set(id_list), set(rec_dict_db))
        self.assertEqual(len(id_list), len(rec_dict))
        for key in id_list:
            self.assertIn(key, rec_dict)
            self.assertEqual(key, rec_dict[key].id.lower())
            self.assertEqual(key, rec_dict.get(key).id.lower())
            raw = rec_dict.get_raw(key)
            self.assertTrue(isinstance(raw, bytes),
                            "Didn't get bytes from %s get_raw" % format)
            self.assertTrue(raw.strip())
            self.assertIn(raw, raw_file)

            if sqlite3:
                raw_db = rec_dict_db.get_raw(key)
                # Via index using format-specific get_raw which scans the file,
                # Via index_db in general using raw length found when indexing.
                self.assertEqual(
                    raw, raw_db,
                    "index and index_db .get_raw() different for %s" % format)

            rec1 = rec_dict[key]
            # Following isn't very elegant, but it lets me test the
            # __getitem__ SFF code is working.
            if format in SeqIO._BinaryFormats:
                handle = BytesIO(raw)
            else:
                handle = StringIO(_bytes_to_string(raw))
            if format == "sff":
                rec2 = SeqIO.SffIO._sff_read_seq_record(
                    handle,
                    rec_dict._proxy._flows_per_read,
                    rec_dict._proxy._flow_chars,
                    rec_dict._proxy._key_sequence,
                    rec_dict._proxy._alphabet,
                    trim=False)
            elif format == "sff-trim":
                rec2 = SeqIO.SffIO._sff_read_seq_record(
                    handle,
                    rec_dict._proxy._flows_per_read,
                    rec_dict._proxy._flow_chars,
                    rec_dict._proxy._key_sequence,
                    rec_dict._proxy._alphabet,
                    trim=True)
            elif format == "uniprot-xml":
                self.assertTrue(raw.startswith(b"<entry "))
                self.assertTrue(raw.endswith(b"</entry>"))
                # Currently the __getitem__ method uses this
                # trick too, but we hope to fix that later
                raw = """<?xml version='1.0' encoding='UTF-8'?>
                <uniprot xmlns="http://uniprot.org/uniprot"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                xsi:schemaLocation="http://uniprot.org/uniprot
                http://www.uniprot.org/support/docs/uniprot.xsd">
                %s
                </uniprot>
                """ % _bytes_to_string(raw)
                handle = StringIO(raw)
                rec2 = SeqIO.read(handle, format, alphabet)
            else:
                rec2 = SeqIO.read(handle, format, alphabet)
            self.assertEqual(True, compare_record(rec1, rec2))
        rec_dict.close()
        del rec_dict
Example #36
    def simple_check(self, filename, fmt, alphabet, comp):
        """Check indexing (without a key function)."""
        msg = "Test failure parsing file %s with format %s" % (filename, fmt)
        if comp:
            mode = "r" + self.get_mode(fmt)
            with gzip.open(filename, mode) as handle:
                id_list = [rec.id for rec in SeqIO.parse(handle, fmt, alphabet)]
        else:
            id_list = [rec.id for rec in SeqIO.parse(filename, fmt, alphabet)]

        with warnings.catch_warnings():
            if "_alt_index_" in filename:
                # BiopythonParserWarning: Could not parse the SFF index:
                # Unknown magic number b'.diy' in SFF index header:
                # b'.diy1.00'
                warnings.simplefilter("ignore", BiopythonParserWarning)

            rec_dict = SeqIO.index(filename, fmt, alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list, msg=msg)
            rec_dict.close()

            if not sqlite3:
                return

            # In memory,
            # note here we give the filenames as a list of strings
            rec_dict = SeqIO.index_db(":memory:", [filename], fmt, alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list, msg=msg)
            rec_dict.close()

            # check error conditions
            with self.assertRaises(ValueError, msg=msg):
                SeqIO.index_db(":memory:", format="dummy")
            with self.assertRaises(ValueError, msg=msg):
                SeqIO.index_db(":memory:", filenames=["dummy"])

            # Saving to file...
            index_tmp = self.index_tmp
            if os.path.isfile(index_tmp):
                os.remove(index_tmp)

            # To disk,
            # note here we give the filename as a single string
            # to confirm that works too.
            rec_dict = SeqIO.index_db(index_tmp, filename, fmt, alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list, msg=msg)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy

            # Now reload it...
            rec_dict = SeqIO.index_db(index_tmp, [filename], fmt, alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list, msg=msg)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy

            # Now reload without passing filenames and format
            # and switch directory to check paths still work
            index_tmp = os.path.abspath(index_tmp)
            os.chdir(os.path.dirname(filename))
            try:
                rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet)
            finally:
                os.chdir(CUR_DIR)
            self.check_dict_methods(rec_dict, id_list, id_list, msg=msg)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy

            os.remove(index_tmp)
Example #37
    def key_check(self, filename, fmt, alphabet, comp):
        """Check indexing with a key function."""
        msg = "Test failure parsing file %s with format %s" % (filename, fmt)
        if comp:
            mode = "r" + self.get_mode(fmt)
            with gzip.open(filename, mode) as handle:
                id_list = [rec.id for rec in SeqIO.parse(handle, fmt, alphabet)]
        else:
            id_list = [rec.id for rec in SeqIO.parse(filename, fmt, alphabet)]

        key_list = [self.add_prefix(id) for id in id_list]

        with warnings.catch_warnings():
            if "_alt_index_" in filename:
                # BiopythonParserWarning: Could not parse the SFF index:
                # Unknown magic number b'.diy' in SFF index header:
                # b'.diy1.00'
                warnings.simplefilter("ignore", BiopythonParserWarning)

            rec_dict = SeqIO.index(filename, fmt, alphabet, self.add_prefix)
            self.check_dict_methods(rec_dict, key_list, id_list, msg=msg)
            rec_dict.close()

            if not sqlite3:
                return

            # In memory,
            rec_dict = SeqIO.index_db(
                ":memory:", [filename], fmt, alphabet, self.add_prefix
            )
            self.check_dict_methods(rec_dict, key_list, id_list, msg=msg)
            # check error conditions
            with self.assertRaises(ValueError, msg=msg):
                SeqIO.index_db(":memory:", format="dummy", key_function=self.add_prefix)
            with self.assertRaises(ValueError, msg=msg):
                SeqIO.index_db(
                    ":memory:", filenames=["dummy"], key_function=self.add_prefix
                )
            rec_dict.close()

            # Saving to file...
            index_tmp = filename + ".key.idx"
            if os.path.isfile(index_tmp):
                os.remove(index_tmp)
            rec_dict = SeqIO.index_db(
                index_tmp, [filename], fmt, alphabet, self.add_prefix
            )
            self.check_dict_methods(rec_dict, key_list, id_list, msg=msg)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy

            # Now reload it...
            rec_dict = SeqIO.index_db(
                index_tmp, [filename], fmt, alphabet, self.add_prefix
            )
            self.check_dict_methods(rec_dict, key_list, id_list, msg=msg)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy

            # Now reload without passing filenames and format
            rec_dict = SeqIO.index_db(
                index_tmp, alphabet=alphabet, key_function=self.add_prefix
            )
            self.check_dict_methods(rec_dict, key_list, id_list, msg=msg)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy
            os.remove(index_tmp)
Example #38
import logging
import os
import platform
import sqlite3
import sys
import tempfile

import Bio
from Bio import SeqIO

# set up log
logging.basicConfig(level=logging.DEBUG)

# debug biopython issue
logging.debug('sys.version')
logging.debug(sys.version)
logging.debug('sqlite3.version')
logging.debug(sqlite3.version)
logging.debug('platform.python_implementation()')
logging.debug(platform.python_implementation())
logging.debug('platform.platform()')
logging.debug(platform.platform())
logging.debug('Bio.__version__')
logging.debug(Bio.__version__)
logging.debug('os.environ')
logging.debug(os.environ)

read_file = '/r1.fq'

outdir = tempfile.mkdtemp(dir=os.environ.get('PWD'))
db_file = os.path.join(outdir, 'r1.idx')

try:
    read_index = SeqIO.index_db(db_file, read_file, 'fastq')
except Exception:
    logging.exception('SeqIO.index_db failed')
    raise
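
As the reload-only tests further down this page show, the SQLite index file records the indexed filenames and the format, so it can be reopened later with no other arguments. A minimal sketch of that round trip, with hypothetical file names:

from Bio import SeqIO

# First call parses the FASTQ file and saves the index to disk.
reads = SeqIO.index_db("r1.idx", "r1.fq", "fastq")
print(len(reads))
reads.close()

# A later call can reopen the saved index without filenames or format,
# provided the indexed FASTQ file is still at the recorded path.
reads = SeqIO.index_db("r1.idx")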
Example #39
def gfftosequence(gff, sequencedir):

    seqdirectory = os.path.abspath(sequencedir)
    seqfiles = [
        os.path.join(seqdirectory, file) for file in os.listdir(seqdirectory)
    ]

    #Make index of fasta files
    print('Indexing sequences...')
    seqdb = SeqIO.index_db('seqdb.idx', seqfiles, 'fasta')
    print('{0} sequences indexed'.format(len(seqdb)))

    #Make gff database
    gff_fn = gff
    db_fn = os.path.basename(gff_fn) + '.db'

    if not os.path.isfile(db_fn):
        gffutils.create_db(gff_fn, db_fn)

    db = gffutils.FeatureDB(db_fn)

    seqs = {}  #dictionary where key is ID and value is sequence
    counter = 0

    UTRs = db.features_of_type('5UTR')

    for UTR in UTRs:
        counter += 1
        chrm = UTR.chrom
        strand = UTR.strand
        ID = UTR.attributes['ID']
        exoncoords = []
        UTRsequence = ''
        for exon in db.children(UTR, featuretype='exon'):
            exoncoords.append([exon.start, exon.stop])

        number_of_exons = len(exoncoords)

        if strand == '+':
            for idx, exonstartstop in enumerate(exoncoords):
                if idx + 1 < number_of_exons:
                    UTRsequence += seqdb[chrm].seq[exonstartstop[0] -
                                                   1:exonstartstop[1]].upper()
                elif idx + 1 == number_of_exons:  #if at the last exon
                    #To check for start codon, take three more nucleotides (change [1]-1 to [1]+2)
                    UTRsequence += seqdb[chrm].seq[exonstartstop[0] -
                                                   1:exonstartstop[1] -
                                                   1].upper()

        elif strand == '-':
            for idx, exonstartstop in enumerate(reversed(
                    exoncoords)):  #reverse exon order since this is - strand
                if idx + 1 < number_of_exons:
                    UTRsequence += seqdb[chrm].seq[exonstartstop[0] -
                                                   1:exonstartstop[1]].upper(
                                                   ).reverse_complement()
                elif idx + 1 == number_of_exons:  #if at the last exon
                    #To check for start codon, take three more nucleotides (change [0] to [0]-3)
                    UTRsequence += seqdb[chrm].seq[
                        exonstartstop[0]:exonstartstop[1]].upper(
                        ).reverse_complement()

        seqs[ID] = UTRsequence

        if (counter <= 50 and counter % 10 == 0) or (counter > 50 and counter % 50 == 0):
            print('Retrieving sequence %i' % counter)

    print('Retrieved {0} sequences.'.format(len(seqs)))

    os.remove(db_fn)
    os.remove('seqdb.idx')

    return seqs
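
A hypothetical invocation, with the GFF file and sequence directory names invented for illustration:

utr_seqs = gfftosequence("annotations.gff", "genome_fastas")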
Example #40
def go(input, output, raw_reads, linear_refs, circular_refs, coverage_file):

    if raw_reads:
        assert os.path.isfile(raw_reads)
        idx = raw_reads + ".idx"
        if os.path.isfile(idx):
            sys.stderr.write("Loading %s\n" % idx)
            raw = SeqIO.index_db(idx)
        else:
            sys.stderr.write("Creating %s\n" % idx)
            raw = SeqIO.index_db(idx, raw_reads, "fastq")
        sys.stderr.write(
            "Have %i raw reads (used for unmapped partners)\n" % len(raw))
    else:
        raw = dict()

    ref_len_linear = dict()
    if linear_refs:
        for f in linear_refs:
            ref_len_linear.update(get_fasta_ids_and_lengths(f))
    ref_len_circles = dict()
    if circular_refs:
        for f in circular_refs:
            ref_len_circles.update(get_fasta_ids_and_lengths(f))
    # print ref_len_circles

    if input is None:
        input_handle = sys.stdin
    elif isinstance(input, str):
        input_handle = open(input)
    else:
        input_handle = input

    if output is None:
        output_handle = sys.stdout
    elif isinstance(output, str):
        output_handle = open(output, "w")
    else:
        output_handle = output

    line = input_handle.readline()
    while line[0] == "@":
        # SAM header
        if line[0:4] == "@SQ\t":
            parts = line[4:].strip().split("\t")
            rname = None
            length = None
            for p in parts:
                if p.startswith("SN:"):
                    rname = p[3:]
                if p.startswith("LN:"):
                    length = int(p[3:])
            if rname in ref_len_linear:
                assert length == ref_len_linear[rname]
                # print "Found @SQ line for linear reference %s" % rname
            elif rname in ref_len_circles:
                if length == 2 * ref_len_circles[rname]:
                    # Return the length to its correct value (should have
                    # happened already)
                    sys.stderr.write("Fixing @SQ line for %s, length %i --> %i\n" %
                                     (rname, length, ref_len_circles[rname]))
                    line = "@SQ\tSN:%s\tLN:%i\n" % (
                        rname, ref_len_circles[rname])
                else:
                    assert length == ref_len_circles[rname]
            elif rname is None:
                sys_exit("Bad @SQ line:\n%s" % line)
            else:
                sys_exit("This reference was not given!:\n%s" % line)
        output_handle.write(line)
        line = input_handle.readline()

    global solo0, solo1, solo2, solo12
    solo0 = solo1 = solo2 = solo12 = 0

    global coverage
    coverage = dict()
    if coverage_file:
        import numpy
        for lengths in [ref_len_linear, ref_len_circles]:
            for ref, length in lengths.items():
                coverage[ref] = numpy.zeros((5, length), float)

    cur_read_name = None
    reads = set()
    while line:
        # SAM read
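        # Split off the first four SAM columns; 'rest' keeps the remaining tab-separated fields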
        qname, flag, rname, pos, rest = line.split("\t", 4)
        flag = int(flag)
        if " " in qname:
            # Stupid mrfast!
            qname = qname.split(None, 1)[0]
        if rname in ref_len_circles and pos != "0":
            length = ref_len_circles[rname]
            if length <= int(pos) - 1:
                sys_exit("Have POS %s yet length of %s is %i (circular)\n" %
                         (pos, rname, length))
        elif rname in ref_len_linear and pos != "0":
            length = ref_len_linear[rname]
            if length <= int(pos) - 1:
                sys_exit("Have POS %s yet length of %s is %i (linear)\n" %
                         (pos, rname, length))
        if qname[-2:] == "/1":
            qname = qname[:-2]
            frag = 1
        elif qname[-2:] == "/2":
            qname = qname[:-2]
            frag = 2
        elif not (flag & 0x1):
            frag = 0  # Single read
        elif flag & 0x40:
            frag = 1
        elif flag & 0x80:
            frag = 2
        else:
            frag = 0  # Assume unpaired
        if qname == cur_read_name:
            # Cache this, as a tuple - ordered to allow sorting on position:
            # Using a set will eliminate duplicates after adjusting POS
            reads.add((qname, frag, rname, pos, flag, rest))
        else:
            if coverage_file:
                count_coverage(coverage, reads)
            flush_cache(output_handle, reads, raw,
                        ref_len_linear, ref_len_circles)
            reads = set([(qname, frag, rname, pos, flag, rest)])
            cur_read_name = qname
        # Next line...
        line = input_handle.readline()

    if reads:
        if coverage_file:
            count_coverage(coverage, reads)
        flush_cache(output_handle, reads, raw, ref_len_linear, ref_len_circles)

    if isinstance(input, str):
        input_handle.close()
    if isinstance(output, str):
        output_handle.close()

    if coverage_file:
        handle = open(coverage_file, "w")
        for lengths in [ref_len_linear, ref_len_circles]:
            for ref, length in lengths.items():
                handle.write(">%s length %i\n" % (ref, length))
                for row in coverage[ref]:
                    assert len(row) == length
                    handle.write("\t".join("%.1f" % v for v in row) + "\n")
        handle.close()
    sys.stderr.write("%i singletons; %i where only /1, %i where only /2, %i where both present\n" %
                     (solo0, solo1, solo2, solo12))
Example #41
#!/usr/bin/env python
import os
from Bio import SeqIO
from RouToolPa.Routines import FileRoutines

workdir = "/home/mahajrod/Genetics/Projects/nxf/nxf_arthropoda/"
data_dir = "/home/mahajrod/Genetics/Projects/nxf/nxf_arthropoda/data/"

os.chdir(workdir)

data_files = FileRoutines.make_list_of_path_to_files([data_dir])

record_dict = SeqIO.index_db("tmp.idx", data_files, format="genbank")

print("#organism\ttaxonomy\tregion_id\ttranscript_id\tproduct\texon_len")
for record_id in record_dict:
    for feature in record_dict[record_id].features:
        if feature.type == "mRNA":
            mRNA_string = ""
            mRNA_string += "%s" % record_dict[record_id].annotations["organism"]
            mRNA_string += "\t%s" % (";".join(
                record_dict[record_id].annotations["taxonomy"]))
            mRNA_string += "\t%s" % record_id
            mRNA_string += "\t%s" % (feature.qualifiers["transcript_id"][0]
                                     if "transcript_id" in feature.qualifiers
                                     else ".")
            mRNA_string += "\t%s" % (feature.qualifiers["product"][0] if
                                     "product" in feature.qualifiers else ".")

            location_lengths = []
Example #42
#!/usr/bin/env python

import numpy as np
import sys
from Bio import SeqIO

paf_file = sys.argv[1]
unitigs = sys.argv[2]
output_stats = sys.argv[3]
output_file = sys.argv[4]

unitigs_corrected = open(output_file, 'w')
stats_file = open(output_stats, 'a')

# open up sequence map for illumina
illumina_dict = SeqIO.index_db(unitigs + ".idx", unitigs, "fasta")


def find_sequence_illumina(seqid, start, end):
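    # 'end' is treated as inclusive here, hence the +1 in the Python slice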
    return illumina_dict[seqid].seq[start:end + 1]


def write_sequence(f, seq):
    for i in range(0, len(seq), 60):
        f.write(str(seq[i:i + 60]) + "\n")


############ read input paf file #######################################################################################
mapping_freq = {}
nano_lengths = {}
Example #43
 def test_old_files(self):
     """Load existing index with correct files."""
     d = SeqIO.index_db(
         "Roche/triple_sff.idx",
         ["E3MFGYR02_no_manifest.sff", "greek.sff", "paired.sff"])
     self.assertEqual(54, len(d))
Example #44
    def get_raw_check(self, filename, fmt, alphabet, comp):
        # Also checking the key_function here
        msg = "Test failure parsing file %s with format %s" % (filename, fmt)
        if comp:
            with gzip.open(filename, "rb") as handle:
                raw_file = handle.read()
            mode = "r" + self.get_mode(fmt)
            with gzip.open(filename, mode) as handle:
                id_list = [rec.id.lower() for rec in SeqIO.parse(handle, fmt, alphabet)]
        else:
            with open(filename, "rb") as handle:
                raw_file = handle.read()
            id_list = [rec.id.lower() for rec in SeqIO.parse(filename, fmt, alphabet)]

        if fmt in ["sff"]:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", BiopythonParserWarning)
                rec_dict = SeqIO.index(filename, fmt, alphabet, key_function=str.lower)
                if sqlite3:
                    rec_dict_db = SeqIO.index_db(
                        ":memory:", filename, fmt, alphabet, key_function=str.lower,
                    )
        else:
            rec_dict = SeqIO.index(filename, fmt, alphabet, key_function=str.lower)
            if sqlite3:
                rec_dict_db = SeqIO.index_db(
                    ":memory:", filename, fmt, alphabet, key_function=str.lower,
                )

        self.assertEqual(set(id_list), set(rec_dict), msg=msg)
        if sqlite3:
            self.assertEqual(set(id_list), set(rec_dict_db), msg=msg)
        self.assertEqual(len(id_list), len(rec_dict), msg=msg)
        for key in id_list:
            self.assertIn(key, rec_dict, msg=msg)
            self.assertEqual(key, rec_dict[key].id.lower(), msg=msg)
            self.assertEqual(key, rec_dict.get(key).id.lower(), msg=msg)
            raw = rec_dict.get_raw(key)
            self.assertIsInstance(raw, bytes, msg=msg)
            self.assertTrue(raw.strip(), msg=msg)
            self.assertIn(raw, raw_file, msg=msg)

            if sqlite3:
                raw_db = rec_dict_db.get_raw(key)
                # Via index using format-specific get_raw which scans the file,
                # Via index_db in general using raw length found when indexing.
                self.assertEqual(raw, raw_db, msg=msg)

            rec1 = rec_dict[key]
            # Following isn't very elegant, but it lets me test the
            # __getitem__ SFF code is working.
            mode = self.get_mode(fmt)
            if mode == "b":
                handle = BytesIO(raw)
            elif mode == "t":
                handle = StringIO(raw.decode())
            else:
                raise RuntimeError("Unexpected mode %s" % mode)
            if fmt == "sff":
                rec2 = SeqIO.SffIO._sff_read_seq_record(
                    handle,
                    rec_dict._proxy._flows_per_read,
                    rec_dict._proxy._flow_chars,
                    rec_dict._proxy._key_sequence,
                    rec_dict._proxy._alphabet,
                    trim=False,
                )
            elif fmt == "sff-trim":
                rec2 = SeqIO.SffIO._sff_read_seq_record(
                    handle,
                    rec_dict._proxy._flows_per_read,
                    rec_dict._proxy._flow_chars,
                    rec_dict._proxy._key_sequence,
                    rec_dict._proxy._alphabet,
                    trim=True,
                )
            elif fmt == "uniprot-xml":
                self.assertTrue(raw.startswith(b"<entry "), msg=msg)
                self.assertTrue(raw.endswith(b"</entry>"), msg=msg)
                # Currently the __getitem__ method uses this
                # trick too, but we hope to fix that later
                raw = (
                    """<?xml version='1.0' encoding='UTF-8'?>
                <uniprot xmlns="http://uniprot.org/uniprot"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                xsi:schemaLocation="http://uniprot.org/uniprot
                http://www.uniprot.org/support/docs/uniprot.xsd">
                %s
                </uniprot>
                """
                    % raw.decode()
                )
                handle = StringIO(raw)
                rec2 = SeqIO.read(handle, fmt, alphabet)
            else:
                rec2 = SeqIO.read(handle, fmt, alphabet)
            self.assertEqual(True, compare_record(rec1, rec2))
        rec_dict.close()
        del rec_dict
Example #45
 def test_old_rel(self):
     """Load existing index (with relative paths) with no options (from parent directory)."""
     d = SeqIO.index_db("Roche/triple_sff_rel_paths.idx")
     self.assertEqual(54, len(d))
     self.assertEqual(395, len(d["alpha"]))
Example #46
 def test_order_index_db(self):
     """Check index_db preserves ordering indexed file."""
     d = SeqIO.index_db(":memory:", [self.f], "fasta")
     self.assertEqual(self.ids, list(d))
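
The fixtures self.f and self.ids come from the surrounding test class; a self-contained sketch of the same ordering check, assuming an arbitrary FASTA file, might look like:

from Bio import SeqIO

ids = [rec.id for rec in SeqIO.parse("example.fasta", "fasta")]
d = SeqIO.index_db(":memory:", ["example.fasta"], "fasta")
assert ids == list(d)  # iteration follows the record order in the indexed file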
Example #47
    def key_check(self, filename, format, alphabet, comp):
        """Check indexing with a key function."""
        if comp:
            h = gzip_open(filename, format)
            id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)]
            h.close()
        else:
            id_list = [
                rec.id for rec in SeqIO.parse(filename, format, alphabet)
            ]

        key_list = [add_prefix(id) for id in id_list]

        with warnings.catch_warnings():
            if "_alt_index_" in filename:
                # BiopythonParserWarning: Could not parse the SFF index:
                # Unknown magic number b'.diy' in SFF index header:
                # b'.diy1.00'
                warnings.simplefilter('ignore', BiopythonParserWarning)

            rec_dict = SeqIO.index(filename, format, alphabet, add_prefix)
            self.check_dict_methods(rec_dict, key_list, id_list)
            rec_dict.close()
            del rec_dict

            if not sqlite3:
                return

            # In memory,
            rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet,
                                      add_prefix)
            self.check_dict_methods(rec_dict, key_list, id_list)
            # check error conditions
            self.assertRaises(ValueError,
                              SeqIO.index_db,
                              ":memory:",
                              format="dummy",
                              key_function=add_prefix)
            self.assertRaises(ValueError,
                              SeqIO.index_db,
                              ":memory:",
                              filenames=["dummy"],
                              key_function=add_prefix)
            rec_dict.close()
            del rec_dict

            # Saving to file...
            index_tmp = filename + ".key.idx"
            if os.path.isfile(index_tmp):
                os.remove(index_tmp)
            rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet,
                                      add_prefix)
            self.check_dict_methods(rec_dict, key_list, id_list)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy
            del rec_dict

            # Now reload it...
            rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet,
                                      add_prefix)
            self.check_dict_methods(rec_dict, key_list, id_list)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy
            del rec_dict

            # Now reload without passing filenames and format
            rec_dict = SeqIO.index_db(index_tmp,
                                      alphabet=alphabet,
                                      key_function=add_prefix)
            self.check_dict_methods(rec_dict, key_list, id_list)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy
            del rec_dict
            os.remove(index_tmp)
Example #48
def make_index(input_file, output_file):
    SeqIO.index_db(output_file, input_file, 'fasta')
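
A hypothetical call, with file names invented for illustration:

make_index("contigs.fasta", "contigs.idx")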
Example #49
def index_gb():
    print("indexing genbank file")
    gb_index = SeqIO.index_db(index_file, gb_file, "genbank")
    print("done indexing file\n")
    return gb_index
Example #50
### Set up the logger
# create logger
logger = logging.getLogger('BuildBioIndex')
logger.setLevel(logging.DEBUG)
# create console handler which logs even debug messages
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(ch)
logger.debug(" ".join(sys.argv))

if len(sys.argv) != 3:
    logger.error("2 arguments are required (index, file fasta file")
    sys.exit(1)

index_file = sys.argv[1]
fasta_file = sys.argv[2]

logger.debug("Index file: %s", index_file)
logger.debug("Fasta file: %s", fasta_file)

if os.path.isfile(fasta_file):
    IndexDB = SeqIO.index_db(index_file, fasta_file, "fasta")
else:
    logger.error("Fasta file (%s) is not a file", fasta_file)
    sys.exit(1)
Example #51
    def simple_check(self, filename, format, alphabet, comp):
        """Check indexing (without a key function)."""
        if comp:
            h = gzip_open(filename, format)
            id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)]
            h.close()
        else:
            id_list = [
                rec.id for rec in SeqIO.parse(filename, format, alphabet)
            ]

        rec_dict = SeqIO.index(filename, format, alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        rec_dict.close()
        del rec_dict

        if not sqlite3:
            return

        # In memory,
        # note here we give the filenames as a list of strings
        rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        rec_dict.close()
        del rec_dict

        # check error conditions
        self.assertRaises(ValueError,
                          SeqIO.index_db,
                          ":memory:",
                          format="dummy")
        self.assertRaises(ValueError,
                          SeqIO.index_db,
                          ":memory:",
                          filenames=["dummy"])

        # Saving to file...
        index_tmp = self.index_tmp
        if os.path.isfile(index_tmp):
            os.remove(index_tmp)

        # To disk,
        # note here we give the filename as a single string
        # to confirm that works too (convenience feature).
        rec_dict = SeqIO.index_db(index_tmp, filename, format, alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        rec_dict.close()
        rec_dict._con.close()  # hack for PyPy
        del rec_dict

        # Now reload it...
        rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        rec_dict.close()
        rec_dict._con.close()  # hack for PyPy
        del rec_dict

        # Now reload without passing filenames and format
        rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        rec_dict.close()
        rec_dict._con.close()  # hack for PyPy
        del rec_dict
        os.remove(index_tmp)
Example #52
parser.add_argument("-i", "--input", action="store", dest="input",
                    type=FileRoutines.make_list_of_path_to_files_from_string,
                    help="Comma-separated list of input files/directories with sequences")
parser.add_argument("-o", "--output_directory", action="store", dest="output", type=FileRoutines.check_path,
                    help="Directory to output groups_of sequences")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of input and output files. Allowed formats genbank, fasta(default)")
parser.add_argument("-e", "--extension", action="store", dest="extension",
                    help="Extension of output files. Default: equal to -f")
parser.add_argument("-d", "--id_file", action="store", dest="id_file",
                    help="File with groups of sequences to extract(.fam file).")

args = parser.parse_args()

FileRoutines.safe_mkdir(args.output)
args.extension = args.extension if args.extension else args.format
tmp_index_file = "temp.idx"

#id_list = read_ids(args.id_file)
id_list = IdSet(filename=args.id_file)

sequence_groups_id = SynDict()
sequence_groups_id.read(args.id_file, split_values=True)
#print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format)
for group in sequence_groups_id:
    SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_dict, sequence_groups_id[group],
                                                        verbose=True),
                "%s%s.%s" % (args.output, group, args.extension), format=args.format)

os.remove(tmp_index_file)
Example #53
 def test_old(self):
     """Load existing index with no options (from parent directory)."""
     d = SeqIO.index_db("Roche/triple_sff.idx")
     self.assertEqual(54, len(d))
     self.assertRaises(FileNotFoundError, d.get_raw, "alpha")
Example #54
                    "--dont_extract_kmer_list",
                    action="store_true",
                    dest="dont_extract_kmer_list",
                    help="Don't extract kmer list")

args = parser.parse_args()

args.input = make_list_of_path_to_files(args.input)
if args.count_both_strands and args.add_rev_com:
    raise ValueError(
        "Options -b/--count_both_strands and -r/--add_reverse_complement are not compatible"
    )

if args.add_rev_com:
    file_with_rev_com = args.base_prefix + "_with_rev_com.fasta"
    record_dict = SeqIO.index_db("temp_index.idx", args.input, format="fasta")
    SeqIO.write(rev_com_generator(record_dict, yield_original_record=True),
                file_with_rev_com, "fasta")
    args.base_prefix += "_with_rev_com"

base_file = "%s_%i_mer.jf" % (args.base_prefix, args.kmer_length)
kmer_table_file = "%s_%i_mer.counts" % (args.base_prefix, args.kmer_length)
kmer_file = "%s_%i_mer.kmer" % (args.base_prefix, args.kmer_length)

Jellyfish.threads = args.threads
Jellyfish.path = args.jellyfish_path if args.jellyfish_path else ""
Jellyfish.count(args.input if not args.add_rev_com else file_with_rev_com,
                base_file,
                kmer_length=args.kmer_length,
                hash_size=args.hash_size,
                count_both_strands=args.count_both_strands)
Example #55
 def test_old_format(self):
     """Load existing index with correct format."""
     d = SeqIO.index_db("Roche/triple_sff.idx", format="sff")
     self.assertEqual(54, len(d))
Example #56
 def test_old(self):
     """Load existing index with no options."""
     d = SeqIO.index_db("Roche/triple_sff.idx")
     self.assertEqual(54, len(d))
Example #57
parser.add_argument("-f",
                    "--format",
                    action="store",
                    dest="format",
                    default="fasta",
                    help="format of file with sequences - default: fasta.")
parser.add_argument("-i",
                    "--input",
                    action="store",
                    dest="input",
                    help="file with sequences")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    default="out.t",
                    help="output file - default: out.t.")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

record_dict = SeqIO.index_db("temp_index.idx", [args.input],
                             format=args.format)
lengths_dict = SequenceRoutines.get_lengths(record_dict, out_file=out_fd)
print("Longest sequence: %i" % max(lengths_dict.values()))
print("Shortest sequence: %i" % min(lengths_dict.values()))
print("Total length: %i" % sum(lengths_dict.values()))
os.remove("temp_index.idx")
Example #58
    def distance_matrix_worker_old(self, seq_keys, length_threshold, dist_matrix, already_compared, lock, process_num, gb_dir):
        """
        Worker process for make_distance_matrix(). Takes a list "already_compared" of sequences that have
        already had all pairwise comparisons. Each worker process will work making pairwise comparisons
        for a different sequence, adding them to the "already_compared" list as they are completed.
        """
        # each process must load its own sqlite gb
        gb = SeqIO.index_db(gb_dir + "/gb.idx")
        process_num = str(process_num)
        i = 0
        color = Color()
        for key in seq_keys:
            # check whether another process is already comparing this row
            compare_row = False
            with lock:
                if key not in already_compared:
                    already_compared.append(key)
                    compare_row = True
            if compare_row:
                # get the sequence record to compare; record1 is the BLAST
                # subject for every pairwise comparison in this row
                record1 = gb[key]
                output_handle = open('subject' + process_num + '.fasta', 'w')
                SeqIO.write(record1, output_handle, 'fasta')
                output_handle.close()
                j = 0
                for key2 in seq_keys:
                    # only calculate e-values for pairs that have not yet been compared
                    if dist_matrix[i][j] == 99:
                        if key == key2:
                            row = dist_matrix[i]
                            row[j] = 0.0
                            dist_matrix[i] = row
                        # check sequence lengths
                        else:
                            # print("proc # = "+process_num+" i = "+str(i)+ " j = "+str(j))
                            record2 = gb[key2]
                            length1 = len(record1.seq)
                            length2 = len(record2.seq)
                            # set distance to 50.0 if length similarity threshold not met
                            if not (length1 * (1 - float(length_threshold)) < length2 < length1 * (1 + float(length_threshold))):
                                row = dist_matrix[i]
                                row[j] = 50.0
                                dist_matrix[i] = row
                                row = dist_matrix[j]
                                row[i] = 50.0
                                dist_matrix[j] = row
                            else:
                                # do the blast comparison
                                output_handle = open('query' + process_num + '.fasta', 'w')
                                SeqIO.write(record2, output_handle, 'fasta')
                                output_handle.close()

                                blastn_cmd = NcbiblastnCommandline(query='query' + process_num + '.fasta', subject='subject' + process_num + \
                                    '.fasta', out='blast' + process_num + '.xml', outfmt=5)
                                stdout, stderr = blastn_cmd()
                                blastn_xml = open('blast' + process_num + '.xml', 'r')
                                blast_records = NCBIXML.parse(blastn_xml)

                                for blast_record in blast_records:
                                    if blast_record.alignments:
                                        if blast_record.alignments[0].hsps:
                                            # blast hit found, set distance to e-value
                                            row = dist_matrix[i]
                                            row[j] = blast_record.alignments[0].hsps[0].expect
                                            dist_matrix[i] = row
                                            row = dist_matrix[j]
                                            row[i] = blast_record.alignments[0].hsps[0].expect
                                            dist_matrix[j] = row
                                    else:
                                        # no blast hit found, set distance to default 10.0
                                        row = dist_matrix[i]
                                        row[j] = 10.0
                                        dist_matrix[i] = row
                                        row = dist_matrix[j]
                                        row[i] = 10.0
                                        dist_matrix[j] = row
                                blastn_xml.close()
                    j += 1
            i += 1
            # update status
            percent = str(round(100 * len(already_compared)/float(len(seq_keys)), 2))
            sys.stdout.write('\r' + color.blue + 'Completed: ' + color.red + str(len(already_compared)) + '/' + str(len(seq_keys)) + ' (' + percent + '%)' + color.done)
            sys.stdout.flush()
        # done looping through all keys, now clean up
        os.remove("blast" + process_num + ".xml")
        os.remove("query" + process_num + ".fasta")
        os.remove("subject" + process_num + ".fasta")
Example #59
    def get_raw_check(self, filename, format, alphabet, comp):
        # Also checking the key_function here
        if comp:
            h = gzip.open(filename, "rb")
            raw_file = h.read()
            h.close()
            h = gzip_open(filename, format)
            id_list = [rec.id.lower() for rec in
                       SeqIO.parse(h, format, alphabet)]
            h.close()
        else:
            h = open(filename, "rb")
            raw_file = h.read()
            h.close()
            id_list = [rec.id.lower() for rec in
                       SeqIO.parse(filename, format, alphabet)]

        if format in ["sff"]:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', BiopythonParserWarning)
                rec_dict = SeqIO.index(filename, format, alphabet,
                                       key_function=lambda x: x.lower())
                rec_dict_db = SeqIO.index_db(":memory:", filename, format, alphabet,
                                             key_function=lambda x: x.lower())
        else:
            rec_dict = SeqIO.index(filename, format, alphabet,
                                   key_function=lambda x: x.lower())
            rec_dict_db = SeqIO.index_db(":memory:", filename, format, alphabet,
                                         key_function=lambda x: x.lower())

        self.assertEqual(set(id_list), set(rec_dict))
        self.assertEqual(set(id_list), set(rec_dict_db))
        self.assertEqual(len(id_list), len(rec_dict))
        for key in id_list:
            self.assertIn(key, rec_dict)
            self.assertEqual(key, rec_dict[key].id.lower())
            self.assertEqual(key, rec_dict.get(key).id.lower())
            raw = rec_dict.get_raw(key)
            self.assertTrue(isinstance(raw, bytes),
                            "Didn't get bytes from %s get_raw" % format)
            self.assertTrue(raw.strip())
            self.assertIn(raw, raw_file)

            raw_db = rec_dict_db.get_raw(key)
            # Via index using format-specific get_raw which scans the file,
            # Via index_db in general using raw length found when indexing.
            self.assertEqual(raw, raw_db,
                             "index and index_db .get_raw() different for %s" % format)

            rec1 = rec_dict[key]
            # Following isn't very elegant, but it lets me test the
            # __getitem__ SFF code is working.
            if format in SeqIO._BinaryFormats:
                handle = BytesIO(raw)
            else:
                handle = StringIO(_bytes_to_string(raw))
            if format == "sff":
                rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                            rec_dict._proxy._flows_per_read,
                            rec_dict._proxy._flow_chars,
                            rec_dict._proxy._key_sequence,
                            rec_dict._proxy._alphabet,
                            trim=False)
            elif format == "sff-trim":
                rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                            rec_dict._proxy._flows_per_read,
                            rec_dict._proxy._flow_chars,
                            rec_dict._proxy._key_sequence,
                            rec_dict._proxy._alphabet,
                            trim=True)
            elif format == "uniprot-xml":
                self.assertTrue(raw.startswith(b"<entry "))
                self.assertTrue(raw.endswith(b"</entry>"))
                # Currently the __getitem__ method uses this
                # trick too, but we hope to fix that later
                raw = """<?xml version='1.0' encoding='UTF-8'?>
                <uniprot xmlns="http://uniprot.org/uniprot"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                xsi:schemaLocation="http://uniprot.org/uniprot
                http://www.uniprot.org/support/docs/uniprot.xsd">
                %s
                </uniprot>
                """ % _bytes_to_string(raw)
                handle = StringIO(raw)
                rec2 = SeqIO.read(handle, format, alphabet)
            else:
                rec2 = SeqIO.read(handle, format, alphabet)
            self.assertEqual(True, compare_record(rec1, rec2))
        rec_dict.close()
        del rec_dict
Example #60
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Author: Kevin Lamkiewicz
# Email: [email protected]

from Bio import SeqIO
import time
import sys
import pickle
import glob
import re

files = glob.glob(f"{sys.argv[1]}/gbvrl*.seq")
gb_vrl = SeqIO.index_db(f"{sys.argv[1]}/gbvrl.idx", files, "genbank")

countryRegex = re.compile(r'country="([^"]+)"')
accessionDateRegex = re.compile(r'collection_date="([^"]+)"')

d_metaInformation = {}

for accession in gb_vrl:
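    # get_raw returns the record's raw bytes; decode so the regexes below run on text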
    genbankEntry = gb_vrl.get_raw(accession).decode("utf-8")

    country = re.findall(countryRegex, genbankEntry)
    if country:
        country = country[0].replace('\n', ' ').replace(' ', '_')
    else:
        country = '--'

    accessionDate = re.findall(accessionDateRegex, genbankEntry)