def nr_by_longest(handle, filetype='fasta', write=True):
    """Collapse a sequence file to one record per ID, keeping the longest.

    Records are read with ``SeqIO.parse``. Each record ID is split on ``'|'``:
    the first field becomes the new ID and the second the description. IDs
    without a ``'|'`` are cut at the first space and get an empty description.
    Records whose sequence equals ``'Sequenceunavailable'`` are reported and
    skipped. On an ID collision the strictly longer record wins; ties keep
    the record that was seen first.

    :param handle: Path (or file name) of the input sequence file. When
        ``write`` is true it must be path-like, because the output file name
        is derived from it.
    :param str filetype: Format name handed to ``SeqIO``. [Default: 'fasta']
    :param bool write: If true, also write the result to ``nr_<input name>``
        (a relative path, so it lands in the current working directory).
    :return list: The non-redundant records, ordered by first appearance of
        their ID. A list (not a one-shot generator) is returned so the result
        is still usable after it has been written out.
    """
    seqdict = {}
    for seq in SeqIO.parse(handle, filetype):
        if seq.seq == 'Sequenceunavailable':
            print('Seq Unavailable:\t', seq.name)
            continue
        fields = seq.id.split('|')
        if len(fields) > 1:
            seq.id, seq.description = fields[0], fields[1]
        else:
            # No '|' delimiter: keep only the first space-delimited token.
            seq.id, seq.description = seq.id.split(' ')[0], ''
        # NOTE(review): the original code had an always-true assertion here
        # (`id != 'gi' or id != 'emb' or id != 'acc'`). It looks like it was
        # meant to reject database tags used as IDs; it is left unenforced so
        # inputs that worked before keep working -- confirm the intent.
        kept = seqdict.get(seq.id)
        if kept is None or len(seq) > len(kept):
            seqdict[seq.id] = seq
    newlist = list(seqdict.values())
    if write:
        outhandle = 'nr_' + str(Path(handle).name)
        with Path(outhandle).open('w') as outf:
            SeqIO.write(newlist, outf, filetype)
    return newlist
def blast_prep(search_type, database, species, verbose, indent, db_loc):
    """Resolve the BLAST database to use for one species.

    :param str search_type: Search program name, forwarded to get_searchdb().
    :param database: 'auto' / 'auto-transcript' to look the database up with
        get_searchdb(), a dict mapping species to databases, or a str/Path
        that is used as given.
    :param str species: Species whose database is wanted.
    :param int verbose: Verbosity level; messages are printed above 1.
    :param int indent: Indent level for printed output.
    :param db_loc: Folder searched by get_searchdb() in the 'auto' modes.
    :return: The selected database (whatever type the chosen source yields).
    :raises SearchError: If no database can be resolved for the species or
        ``database`` has an unsupported type.
    """
    if database in ('auto', 'auto-transcript'):
        if verbose > 1:
            print('Blast type set to auto!', indent=indent)
        try:
            return get_searchdb(search_type=search_type,
                                species=species,
                                db_loc=db_loc,
                                verbose=verbose,
                                indent=indent + 1)
        except Exception:
            raise SearchError(
                'No BLAST database was found for species {}!'.format(species))

    if isinstance(database, dict):
        try:
            chosen = database[species]
        except KeyError:
            raise SearchError(
                'No BLAST database was found for species {}!'.format(species))
        if verbose > 1:
            print('Using {} as BLAST database!'.format(chosen), indent=indent)
        return chosen

    if isinstance(database, (str, Path)):
        return database

    raise SearchError('Invalid type given for database!')
def blat_prep(database_port, species, verbose, indent):
    """Resolve the gfServer port to use for one species.

    :param database_port: A dict mapping species to ports, an int port, or a
        string that can be converted to an int.
    :param str species: Species whose port is wanted (dict lookups only).
    :param int verbose: Verbosity level; messages are printed above 1.
    :param int indent: Indent level for printed output.
    :return: The port; dict values are returned as stored, strings as int.
    :raises SearchError: If the species is missing from the dict, the string
        is not an integer, or the argument has an unsupported type.
    """
    if isinstance(database_port, dict):
        try:
            port = database_port[species]
        except KeyError:
            raise SearchError(
                'No 2bit found for species {}!'.format(species))
        if verbose > 1:
            print('Using port {0} for gfServer of species {1}.'.format(
                port, species), indent=indent)
        return port

    if isinstance(database_port, int):
        return database_port

    if isinstance(database_port, str):
        try:
            return int(database_port)
        except ValueError:
            raise SearchError(
                'Invalid option "{}" was passed to database_port! '
                'database_port must be either a dictionary of species-port '
                'pairs or an integer!'.format(database_port))

    raise SearchError(
        'Invalid option of type "{}" was passed to database_port! '
        'database_port must be either a dictionary of species-port pairs or '
        'an integer!'.format(str(type(database_port))))
def twobit(id_full, id_item, database, indent, verbose):
    """Fetch one sequence from a 2bit file by running ``twoBitToFa``.

    The command ``twoBitToFa <database>:<id_full> /dev/stdout`` is executed
    and its standard output is parsed as a single FASTA record.

    :param id_full: Sequence specifier appended to the database after ':'.
    :param id_item: Identifier of the request (not used on the success path).
    :param database: The 2bit file handed to twoBitToFa.
    :param int indent: Indent level for printed output.
    :param int verbose: Verbosity level (messages at >0, >1 and >3).
    :return tuple: ``(record, None)``. ``check_output`` with
        ``universal_newlines=True`` yields a str, so the second element is
        None whenever the function returns.
    :raises FetchSeqError: If the command output is not a str.
    """
    command = [
        "twoBitToFa",
        '{0}:{1}'.format(database, id_full),
        '/dev/stdout',
    ]
    if verbose > 1:
        print('Command:', indent=indent)
        print(' '.join(command), indent=indent + 1)
    raw_output = subprocess.check_output(command,
                                         universal_newlines=True,
                                         stdin=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
    if verbose > 1:
        print("TwoBitToFa type: ", type(raw_output), indent=indent)
    if type(raw_output) is not str:
        # Anything else is treated as an (stdout, stderr) pair and reported.
        seq_out, seq_err = raw_output
        raise FetchSeqError(seq_err)
    seq_out = raw_output
    if verbose:
        print('Got sequence for ', id_full, indent=indent)
    if verbose > 3:
        print(str(seq_out).replace('\n', '\n' + '\t' * (indent + 1)),
              indent=indent + 1)
    with StringIO(seq_out) as output:
        seq = SeqIO.read(output, 'fasta')
    return seq, None
def run(self):
    """Consume tasks from ``self.task_queue`` until a ``None`` sentinel.

    Each task is called with ``server`` and ``sub_db_name`` taken from the
    instance; the task is marked done and its return value is then put on
    ``self.result_queue``. The sentinel is also marked done before the
    method returns.
    """
    while True:
        task = self.task_queue.get()
        if task is not None:
            result = task(server=self.server, sub_db_name=self.sub_db_name)
            self.task_queue.task_done()
            self.result_queue.put(result)
            continue
        # Sentinel received: acknowledge it and stop.
        if self.verbose:
            print('\tAll GetSeq tasks in this process are complete')
        self.task_queue.task_done()
        return
def cull_reciprocal_best_hit(recblast_out):
    """Keep only the reciprocal best hits in a recblast_out container.

    For every species/query pair the ``'recblast_results'`` records are
    filtered: a record is kept when its description has the form
    ``<target>|-|<annotations>`` and the query name occurs in the first
    ``|[...]|`` annotation. The container is modified in place.

    :param recblast_out: A mapping ``{species: {query: {...}}}`` or a list
        of such mappings (each element is processed recursively).
    :return: The same container that was passed in (or a list of them).
    """
    if isinstance(recblast_out, list):
        return [cull_reciprocal_best_hit(rc) for rc in recblast_out]

    # Raw string: the escapes are regex escapes, not Python string escapes.
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation
    for species, rc_spec_rec in recblast_out.items():
        for query, rc_rec in rc_spec_rec.items():
            try:
                rc_out = rc_rec['recblast_results']
            except KeyError:
                print('No entries in recblast_results for query {0} in species {1}'.format(query, species))
                continue
            kept = []
            for record in rc_out:
                try:
                    target_id, annotations = record.description.split('|-|')
                except ValueError:
                    # Zero or more than one '|-|' separator in the description.
                    print(record.description, indent=2)
                    print('Could not unpack annotations!', indent=2)
                    continue
                id_lst = pat.findall(annotations)
                if not id_lst:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(record.name,
                                                                                               species,
                                                                                               query))
                    continue
                # Substring test against the top annotation only.
                if query in id_lst[0]:
                    kept.append(record)
                else:
                    print("For query {0}, target {1} was not a reciprocal best hit!".format(query, target_id))
            recblast_out[species][query]['recblast_results'] = kept
    return recblast_out
def __init__(self, file):
    """Parse a '#'-annotated, tab-separated table file into attributes.

    Layout understood by the parser (each line is stripped first):

    * line 1: header text, stored in ``self.header``;
    * ``# region=<name>``: starts a region; the sanitised name is appended
      to ``self.regions`` and becomes an attribute holding a list of rows;
    * ``#<col1>\\t<col2>...`` (a '#' not followed by a space): column names,
      stored in ``self.colnames``;
    * ``# No data``: appends a row of ``None`` values to the current region;
    * anything else: a tab-separated data row for the current region.

    Punctuation in region and column names is replaced by underscores so
    the names can be used as attribute / namedtuple field names.

    :param file: Path to the file, as ``str`` or ``Path``.
    :raises TypeError: If ``file`` is neither ``str`` nor ``Path``; also
        re-raised when a data row does not fit the declared columns.
    :raises AssertionError: If the path does not exist or is not a file.
    :raises NameError: If a data row appears before the column names or the
        region were declared.
    """
    specials = '!@#$%^&*();:.,\'\"/\\?<>|[]{}-=+'
    transtab = str.maketrans(specials, '_' * len(specials))
    if not isinstance(file, (str, Path)):
        raise TypeError('File must be either a str or Path object!')
    # Both accepted types end up as a Path; previously the Path branch never
    # assigned self.file and failed with AttributeError.
    self.file = Path(file)
    assert self.file.exists(), str(file) + ' is an invalid file path or does not exist!'
    assert self.file.is_file(), str(file) + ' is not a valid file!'
    self.regions = []
    region_prefix = '# region='
    with self.file.open() as f:
        for index, line in enumerate(f):
            line = line.strip()
            if index == 0:
                self.header = line.lstrip('# ')
            elif line.startswith(region_prefix):
                # Slice the prefix off; str.lstrip() would strip a *set* of
                # characters and eat leading letters of the region name.
                region = line[len(region_prefix):].translate(transtab)
                if getattr(self, region, None) is None:
                    self.regions.append(region)
                    setattr(self, region, [])
            elif line.startswith('#') and not line.startswith('# '):
                cnames = line.lstrip('#').translate(transtab)
                ColNames = namedtuple('ColNames', cnames.split('\t'))
                self.colnames = ColNames._fields
            elif line.startswith('# No data'):
                newitem = getattr(self, region, []) + [ColNames(*[None] * len(self.colnames))]
                setattr(self, region, newitem)
            else:
                try:
                    newitem = getattr(self, region, []) + [ColNames(*line.split('\t'))]
                    setattr(self, region, newitem)
                except NameError as err:
                    # `region` / `ColNames` are only bound once declared.
                    raise NameError(str(err) + '\nParser encountered a line of data before either the column names '
                                               'or the genomic region was declared in the file!')
                except TypeError:
                    print(line, file=sys.stderr)
                    raise
def run(self):
    """Consume FetchSeq callables from ``self.id_queue`` until a ``None``.

    Every callable is invoked with the connection/lookup settings stored on
    the instance and must return ``(id_item, seq, miss_items)``. Any
    exception raised while calling it is reported and replaced by the
    placeholder result ``('', '')`` with an empty miss list. After each call
    the task is marked done and ``((id_item, seq), miss_items)`` is put on
    ``self.seq_out_queue``.
    """
    while True:
        job = self.id_queue.get()
        if job is None:
            self.id_queue.task_done()
            print('All FetchSeqs in Queue completed!', indent=self.indent)
            return
        try:
            # Built inside the try so attribute errors are reported too.
            call_kwargs = dict(
                passwd=self.passwd,
                id_type=self.id_type,
                driver=self.driver,
                user=self.user,
                host=self.host,
                database=self.database,
                database_path=self.database_path,
                delim=self.delim,
                server=self.server,
                version=self.version,
                add_length=self.add_length,
                species=self.species,
                source=self.source,
                verbose=self.verbose,
                n_threads=self.n_subthreads,
                indent=self.indent,
            )
            id_item, seq, miss_items = job(**call_kwargs)
        except Exception as err:
            print('FetchSeq Error!')
            print(type(err), err)
            id_item, seq, miss_items = '', '', []
        self.id_queue.task_done()
        self.seq_out_queue.put(((id_item, seq), miss_items))
def format_range(seqrange, strand, addlength, indent, verbose):
    """Normalise a sequence range and extend it on both sides.

    The range is ordered so that the left coordinate is the smaller one; if
    the input was given right-to-left the strand is flipped ('-' becomes
    '+', anything else becomes '-'). ``addlength[0]`` is then subtracted
    from the left coordinate and ``addlength[1]`` added to the right one.

    Unparseable extensions fall back to ``(0, 0)`` and an unparseable range
    falls back to ``(0, -1)``; the problem is printed and processing goes
    on, so a bad range behaves exactly like an explicit ``(0, -1)``.

    :param seqrange: Two-item sequence of int-convertible coordinates.
    :param str strand: Strand symbol of the range.
    :param tuple addlength: ``(left_extension, right_extension)``.
    :param int indent: Indent level for printed output.
    :param int verbose: Verbosity level (messages at >1 and >2).
    :return tuple: ``((new_left, new_right), strand)``.
    :raises AssertionError: If ``addlength`` is not a tuple of length 2.
    """
    assert isinstance(
        addlength, tuple), "addlength was of type {}, must be a tuple!".format(
            type(addlength))
    assert len(
        addlength
    ) == 2, "addlength must be a tuple of length 2! Received: {}".format(
        addlength)
    # Deliberately broad: any parsing problem selects the fallback values.
    try:
        lextend, rextend = -int(addlength[0]), int(addlength[1])
    except Exception as err:
        print(type(err), err)
        lextend, rextend = 0, 0
    try:
        lrange, rrange = int(seqrange[0]), int(seqrange[1])
    except Exception as err:
        print(type(err), err)
        lrange, rrange = 0, -1
    if verbose > 1:
        print('Original range: {0}-{1}{2}'.format(lrange, rrange, strand),
              indent=indent)
        print(
            'Adding {0} steps to the beginning and {1} steps to the end of the sequence!'
            .format(lextend, rextend),
            indent=indent)
    if lrange > rrange:
        strand = '+' if strand == '-' else '-'
        # Swap the already-parsed values instead of re-reading seqrange, so
        # the fallback range cannot re-trigger the parsing failure.
        lrange, rrange = rrange, lrange
    newrange = (lrange + lextend, rrange + rextend)
    if verbose > 2:
        # Report the extended range, which is what is returned.
        print('New range: {0}-{1}{2}'.format(newrange[0], newrange[1], strand),
              indent=indent)
    return newrange, strand
def drop_overlaps_bed(bedfile):
    """Drop every BED entry that overlaps another entry on the same sequence.

    Entries are grouped by their first key field. Within a group two
    entries overlap when their coordinate intervals share at least one
    position (bounds inclusive, coordinates ordered before comparing). All
    members of an overlapping set are removed, not just the later ones.

    Each interval carries its original key, so entries are copied to the
    result under exactly the key they had in the input. This also works for
    keys with reversed or string coordinates, which previously could not be
    mapped back and raised a bare ``Exception``.

    :param bedfile: Either a dict keyed by tuples whose first three items
        are (sequence name, start, end), or something ``read_bed`` accepts.
    :return dict: The non-overlapping entries, grouped by sequence name in
        order of first appearance.
    """
    d = bedfile if isinstance(bedfile, dict) else read_bed(bedfile, key_col=slice(0, 3))

    by_chrom = {}
    for key in d:
        start, end = sorted((int(key[1]), int(key[2])))
        by_chrom.setdefault(key[0], []).append((start, end, key))

    filtered_d = {}
    for intervals in by_chrom.values():
        for idx, (start, end, key) in enumerate(intervals):
            overlaps_other = any(
                max(start, other_start) <= min(end, other_end)
                for jdx, (other_start, other_end, _) in enumerate(intervals)
                if jdx != idx)
            if not overlaps_other:
                filtered_d[key] = d[key]
    return filtered_d
def fun(self, hit, stat, verbose=False):
    """Tell whether the top annotation of ``hit`` names the symbol ``stat``.

    ``hit.description`` is expected to look like
    ``<target>|-|<annotations>``, where the annotations contain items of
    the form ``|[<id>:...]|``. The first such id is resolved with
    ``id_search(..., id_type='symbol')`` and compared with ``stat``.

    :param hit: Record-like object with a ``description`` attribute.
    :param stat: Symbol the hit is compared against.
    :param verbose: Verbosity forwarded to ``id_search``.
    :return bool: True on a match; False when the description cannot be
        unpacked, no annotation is present, or the symbols differ.
    """
    # Raw string: the escapes are regex escapes, not Python string escapes.
    pat = re.compile(r'\|\[(.*?):.*\]\|')  # regex for items in annotation
    try:
        top_anno = hit.description.split('|-|')[1]
    except (ValueError, IndexError):
        print(hit.description, indent=2)
        print('Could not unpack annotations!', indent=2)
        return False
    matches = pat.findall(top_anno)
    if not matches:
        # Previously an unguarded [0] raised IndexError here.
        return False
    top_id = matches[0].strip()
    if not top_id:
        return False
    _, hit_symbol, _, _ = id_search(top_id, id_type='symbol', verbose=verbose)
    # Always hand back a real bool instead of falling through to None.
    return bool(stat == hit_symbol)
def count_reciprocal_best_hits_from_pandas(pandas_df):
    """Count reciprocal best hits per species and query in a results table.

    For each ``target_species`` / ``query_name`` combination the cells in
    columns 5 up to (not including) the last one are read as FASTA-style
    description lines. A hit counts when the symbol resolved from its
    ``|[...]|`` annotation (via ``id_search``) equals the query name.

    :param pandas_df: DataFrame with ``target_species`` and ``query_name``
        columns; hit descriptions are taken by column position.
    :return dict: ``{species: Counter({query: n_reciprocal_best_hits})}``.
    """
    # Raw string: the escapes are regex escapes, not Python string escapes.
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation
    species_counters = {}
    for species in list(pandas_df.target_species.unique()):
        species_counters[species] = Counter()
        species_results = pandas_df.loc[pandas_df['target_species'] == species]
        for query in list(species_results.query_name.unique()):
            print(query)
            # `.ix` was removed from pandas; the slice was positional, so
            # `.iloc` is the direct replacement.
            query_results = species_results.loc[species_results['query_name'] == query].iloc[:, 5:-1]
            rc_out = []
            for _, row in query_results.iterrows():
                rc_out += row.tolist()
            # Annoying shunt: turn the cells into FASTA headers so SeqIO can
            # parse them. Non-string cells (None, NaN) are empty slots.
            rc_out_asfasta = '\n'.join('>' + cell for cell in rc_out if isinstance(cell, str))
            for hit in SeqIO.parse(StringIO(rc_out_asfasta), 'fasta'):
                try:
                    hit_split = hit.description.split('|-|')
                    id_lst = ''.join(pat.findall(hit_split[1]))
                except (ValueError, IndexError):
                    # IndexError: the description has no '|-|' separator.
                    print(hit.description, indent=2)
                    print('Could not unpack annotations!', indent=2)
                    continue
                if not id_lst:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(hit.name,
                                                                                               species,
                                                                                               query))
                    continue
                _, hit_symbol, _, _ = id_search(id_lst, id_type='symbol', verbose=0)
                if query == hit_symbol:
                    species_counters[species].update({query: 1})
    return species_counters
def bed_extract_duplicates(bedfile, outfile="", verbose=False):
    """Write the BED entries whose base name occurs more than once.

    Entry names are expected to look like ``<base>_<suffix>``; the base is
    everything before the last underscore (a name without an underscore is
    its own base). The same base is used for counting and for the lookup.
    Previously the two used different keys, so names containing several
    underscores were never reported.

    Nothing is written when there are no duplicates.

    :param bedfile: Path of the BED file, read with ``read_bed``.
    :param outfile: Output path. [Default: the input path with its suffix
        replaced by ``.bed.dups``]
    :param bool verbose: Print each duplicate with its count.
    :raises AssertionError: If ``bedfile`` does not exist or is not a file.
    """
    bedfile = Path(bedfile)
    assert bedfile.exists(), "Given bedfile path does not exist!"
    assert bedfile.is_file(), "Given bedfile path was not a file! Did you provide a directory?"
    bed_dict = read_bed(bedfile)
    hits = sorted(bed_dict.keys())

    def base_name(hit):
        # Single definition of "which query does this hit belong to".
        return hit.rsplit("_", 1)[0]

    counts = Counter(base_name(hit) for hit in hits)
    duphits = [hit for hit in hits if counts[base_name(hit)] > 1]
    outfile = Path(outfile) if outfile else bedfile.with_suffix(".bed.dups")
    if not duphits:
        if verbose:
            print("No duplicates found in file!")
        return
    with outfile.open("w") as of:
        for hit in duphits:
            if verbose:
                print(hit, "\t", counts[base_name(hit)])
            of.write("\t".join(str(i) for i in bed_dict[hit]) + "\n")
def __call__(self, sub_db_name, server):
    """Look up ``self.identifier`` in one sub-database of a BioSQL server.

    The sub-database is fetched with ``server[sub_db_name]``; after a
    ``KeyError`` the lookup is retried once following a 0.1 second pause,
    and a second ``KeyError`` propagates to the caller.

    :param sub_db_name: Key of the sub-database inside ``server``.
    :param server: Mapping-like server object.
    :return tuple: ``(self.identifier, seqrec)`` where ``seqrec`` is the
        result of ``biosql_seq_lookup_cascade``.
    """
    if self.verbose:
        print('\tFetching sequence: ', self.identifier, indent=self.indent)
    try:
        dtbase = server[sub_db_name]
    except KeyError as err:
        print('Woah! KeyError!', err, indent=self.indent)
        print(
            'Waiting for 0.1 second and rerunning in case it was a collision!',
            indent=self.indent)
        sleep(0.1)
        # Second and last attempt; a KeyError here is not handled.
        dtbase = server[sub_db_name]
    lookup_args = dict(
        dtbase=dtbase,
        sub_db_name=sub_db_name,
        id_type=self.id_type,
        indent=self.indent,
        identifier=self.identifier,
        verbose=self.verbose,
    )
    seqrec = biosql_seq_lookup_cascade(**lookup_args)
    return self.identifier, seqrec
def __call__(self,
             seq_record,
             species,
             database,
             database_path,
             local,
             indent,
             perc_ident,
             verbose,
             database_port=None,
             expect=None,
             megablast=True,
             n_threads=1,
             write=False,
             filetype=None,
             **kwargs):
    """Run (or load) a search for ``seq_record`` and return its result.

    Dispatch order:

    1. ``database`` is a ``Path``: the stored result is loaded from it.
    2. ``database`` equals ``'stop'``: ``StopRecBlast`` is raised.
    3. ``database`` is any other ``str``: it is loaded as a path.
    4. ``self.search_type`` is a BLAST flavour: the database is resolved
       with ``self.blast_prep`` and the search run with ``self.blast_run``.
    5. ``self.search_type`` is a BLAT flavour: the port is resolved with
       ``self.blat_prep`` and the search run with ``self.blat_run``.

    Extra keyword arguments are forwarded to ``self.blast_run`` only.

    :raises StopRecBlast: When ``database`` is ``'stop'``.
    :raises SearchEngineNotImplementedError: For any other search type.
    """
    # query_length = len(seq_record)
    if isinstance(database, Path):
        return self.load(database)
    if database == 'stop':
        raise StopRecBlast()
    if isinstance(database, str):
        return self.load(Path(database))

    blast_types = ("blastn", "blastp", "blastx", "tblastx", "tblastn")
    blat_types = ('blat', 'tblat', 'blat-transcript', 'tblat-transcript')

    if self.search_type in blast_types:
        if verbose > 1:
            print(self.search_type, 'was selected.', indent=indent)
        dt = self.blast_prep(search_type=self.search_type,
                             db_loc=database_path,
                             database=database,
                             species=species,
                             verbose=verbose,
                             indent=indent)
        return self.blast_run(seq_record=seq_record,
                              species=species,
                              database=dt.name,
                              filetype=filetype,
                              blast_type=self.search_type,
                              local_blast=local,
                              expect=expect,
                              megablast=megablast,
                              use_index=False,
                              perc_ident=perc_ident,
                              verbose=verbose,
                              indent=indent,
                              n_threads=n_threads,
                              blastdb=database_path,
                              outtype=5,
                              return_raw=False,
                              **kwargs)

    if self.search_type in blat_types:
        if verbose > 1:
            print(self.search_type, 'was selected.', indent=indent)
        port = self.blat_prep(database_port=database_port,
                              species=species,
                              verbose=verbose,
                              indent=indent)
        return self.blat_run(seq_record=seq_record,
                             local=local,
                             port=port,
                             filetype=filetype,
                             blat_type=self.search_type,
                             perc_ident=perc_ident,
                             verbose=verbose,
                             indent=indent,
                             blatdb=database_path,
                             outtype='pslx')

    raise SearchEngineNotImplementedError(
        'Invalid selection for search type!')
def id_search(id_rec,
              id_type='brute',
              verbose=2,
              indent=0,
              custom_regex=None,
              regex_only=False):
    r"""Split an identifier string into ID, range, strand and score.

    Examples of strings that are understood::

        accession   'XP_010883249.1'
        scaffold    'scaffold_145\t[:1033526-1034566](-)\t190'
        chr         'chrX[:3047971-3259961](-)119'
        assembly    'KN678312.1 [:9787-29116](+) 478'
        symbol      'TP53'
        symbol      'INS [:259-568](+) (161)'

    Every ID pattern has five groups: the ID itself, a separator, the
    ``start-end`` range, a separator, and the remainder, which is searched
    for a strand such as ``(+)`` and a numeric score.

    :param str id_rec: The identifier string to parse.
    :param str id_type: Name of the pattern to use, or 'brute' to try
        'accession', 'gi', 'scaffold', 'id', 'chr', 'assembly',
        'assembly_broad' and 'symbol' in that order and use the first match.
    :param int verbose: Verbosity level; messages are printed above 1.
    :param int indent: Indent level for printed output.
    :param custom_regex: Pattern (compiled or string) with the same five
        groups; when given it is used instead of ``id_type``.
    :param bool regex_only: In 'brute' mode, return just the compiled
        pattern that matched. Has no effect for an explicit ``id_type``.
    :return tuple: ``(patterns, id, (start, end, strand, score), id_type)``.
        Without a range ``start, end`` are ``0, -1``; without strand/score
        they are ``'(N)'`` and ``0``.
    :raises IDError: If the string does not match the selected pattern(s).
    """
    # Raw strings throughout. This matters for '\b': in a normal string it
    # is a backspace character, so 'assembly_broad' could never match.
    tail = r'([| \t:_])?\[?(:?\d+-?\d+)?\]?([| \t:_])?(.*)'
    p = dict(
        gi=re.compile(r'(\Agi[| _:]+[0-9.]+)' + tail),
        accession=re.compile(
            r'(\A[AXNYZ][MWRCPGTZ][| _:]+[0-9.]+|\Aref[| _:]+[0-9.]+)' + tail),
        scaffold=re.compile(r'(\Ascaffold[| _:]+[0-9.]+)' + tail),
        id=re.compile(r'(\Aid[| _:]*[0-9.]+)' + tail),
        # The first separator is lazy here, unlike in the shared tail.
        chr=re.compile(r'(\Achr[| _:]*[A-Za-z0-9.]+)'
                       r'([| \t:_])??\[?(:?\d+-?\d+)?\]?([| \t:_])?(.*)'),
        assembly=re.compile(r'(\A[A-Za-z]+[0-9.]+)' + tail),
        assembly_broad=re.compile(r'(\b[ALYB]+[0-9.]+)' + tail),
        symbol=re.compile(r'(\A\S+)' + tail),
        seq_range=re.compile(r':?(\d+)-(\d+)'),
        strand=re.compile(r'(\([-+0N]\))'),
        score=re.compile(r'\d\d*'))
    if custom_regex is not None:
        # Add the custom pattern to the table rather than replacing it: the
        # range/strand/score helpers are still needed further down.
        p = dict(p, custom=re.compile(custom_regex))
        id_type = 'custom'

    # Begin search:
    if verbose > 1:
        print('ID Loaded, performing regex search for identifiers...',
              indent=indent)
        print('ID type: ', id_type, indent=indent)

    if id_type == 'brute':
        for tmp_type in ('accession', 'gi', 'scaffold', 'id', 'chr',
                         'assembly', 'assembly_broad', 'symbol'):
            if p[tmp_type].findall(id_rec):
                if verbose > 1:
                    print(
                        'Brute Force was set, tested strings for all pre-registered IDs.',
                        indent=indent)
                    print('ID was selected as type {0}!'.format(tmp_type),
                          indent=indent + 1)
                if regex_only:
                    return p[tmp_type]
                return id_search(id_rec=id_rec,
                                 id_type=tmp_type,
                                 verbose=verbose,
                                 indent=indent)
        raise IDError(
            "Couldn't identify the id type of line: {}!".format(id_rec))

    try:
        item_parts = p[id_type].findall(id_rec)[0]
        if verbose > 1:
            print('Successfully found {0}, compiling list!'.format(id_type),
                  indent=indent)
            print('Item:\t', '\t'.join(item_parts), indent=indent + 1)
    except IndexError:
        raise IDError(
            'Could not identify patterns in {0} with id_type={1}, '
            'is the id_search sequence correct?'.format(id_rec, id_type))
    try:
        item_parts = list(item_parts)
        item_parts[0] = item_parts[0] if not isinstance(
            item_parts[0], str) else ''.join(item_parts[0])
        if item_parts[2]:
            try:
                sr_tuple = p['seq_range'].findall(item_parts[2])[0]
                if verbose > 1:
                    print('Found sequence delimiters in IDs!', indent=indent)
                    print(sr_tuple, indent=indent + 1)
            except IndexError:
                raise IDError(
                    'A positive match for a sequence range was found '
                    '({0}), yet no hits were identified! Confirm that '
                    'the regex is correct and try again!'.format(
                        item_parts[2]))
        else:
            sr_tuple = (0, -1)
        if item_parts[4]:
            try:
                strand = p['strand'].findall(item_parts[4])[0]
            except IndexError:
                strand = '(N)'
            try:
                score = p['score'].findall(item_parts[4])[0]
            except IndexError:
                # Same '0' sentinel as the no-remainder case below.
                score = '0'
        else:
            strand = '(N)'
            score = '0'
        if verbose > 1:
            if strand != '(N)':
                print('Strand info found: {0}'.format(strand), indent=indent)
            if score != '0':
                print('Score info found: {0}'.format(score), indent=indent)
        seq_range = (int(sr_tuple[0]), int(sr_tuple[1]), strand, int(score))
        return p, item_parts[0], seq_range, id_type
    except IndexError:
        raise IDError(
            'Could not identify patterns in {0} with id_type={1}, '
            'is the id_search sequence correct?'.format(id_rec, id_type))
def get_searchdb(search_type, species, db_loc, verbose=1, indent=0):
    """Locate the search database file for a species and search type.

    Spaces in the species name are replaced by underscores, and the folder
    ``db_loc`` is searched with a glob pattern that depends on the search
    type:

    * blastp, blastx: ``Genus_species_protein*``
    * blastn, tblastn, tblastx: ``Genus_species_genome*``
    * the same three with '-transcript': ``Genus_species_transcript*``
    * blat, tblat: ``Genus_species*.2bit``
    * blat-transcript, tblat-transcript: ``Genus_species*transcript.2bit``

    If nothing matches, the search is repeated with a case-insensitive
    abbreviation built from the first three letters of each part of the
    name ('Homo sapiens' becomes 'HomSap'). When several files match, the
    last one in sorted order is returned.

    Usage::

        >>> get_searchdb('blastp', 'Homo sapiens', '/path/to/search/files')
        /path/to/search/files/Homo_sapiens_protein.*
        >>> get_searchdb('tblastn', 'Homo sapiens', '/path/to/search/files')
        /path/to/search/files/HomSap_genome.*
        >>> get_searchdb('blat', 'Homo sapiens', '/path/to/search/files')
        /path/to/search/files/HomSap.2bit

    :param str search_type: Name of the search method (blastp, blastn,
        blat, tblat, ...).
    :param str species: Name of the species associated with the database.
    :param str db_loc: Path to the folder holding the search databases.
    :param int verbose: How verbose the output should be; 0 suppresses the
        progress messages.
    :param int indent: Indent level for printed output.
    :return: Path to the selected search database.
    :raises SearchError: If ``search_type`` is not a known search type.
    :raises DatabaseNotFoundError: If ``db_loc`` is not an existing folder
        or no matching file is found.
    """
    if verbose:
        print('Search DB set to auto, choosing search_db...', indent=indent)
    species = species.replace(' ', '_')
    if verbose > 1:
        print('Search DB location set to: ', db_loc, indent=indent)

    kind_by_search = {
        'blastx': "protein",
        'blastp': "protein",
        'blastn': "genome",
        'tblastn': "genome",
        'tblastx': "genome",
        'blastn-transcript': "transcript",
        'tblastn-transcript': "transcript",
        'tblastx-transcript': "transcript",
        'blat': "blat",
        'tblat': "blat",
        'blat-transcript': 'blat-transcript',
        'tblat-transcript': 'tblat-transcript',
    }
    try:
        db_type = kind_by_search[search_type]
    except KeyError:
        print('Unable to determine search db type!', indent=indent)
        raise SearchError(
            'Improper search type given ({})!'.format(search_type))
    if verbose > 1:
        print('DB type: ', db_type, indent=indent)

    # Prefer the absolute form of the folder; fall back to the path as given.
    db_path = Path(db_loc).absolute()
    if not db_path.exists():
        db_path = Path(db_loc)
    if not (db_path.exists() and db_path.is_dir()):
        raise DatabaseNotFoundError('DB_Path {} does not exist!'.format(
            str(db_path)))

    def find_candidates(prefix):
        # Glob db_path for files of the wanted db_type starting with prefix.
        if db_type == 'blat':
            pattern = '{0}*.2bit'.format(prefix)  # Todo: generalize extension
        elif db_type in ['blat-transcript', 'tblat-transcript']:
            pattern = '{0}*transcript.2bit'.format(prefix)
        else:
            pattern = '{0}_{1}*'.format(prefix, db_type)
        return list(db_path.glob(pattern))

    glob_path = find_candidates(species)
    if not glob_path:
        if verbose:
            print('No DB found! Trying again with abbreviated species name',
                  indent=indent)
        species_abbv = ''.join(
            part[0:3] for part in species.title().split('_'))
        # making it insensitive to case for Glob
        species_abbv_insensitive = ''.join(
            '[{0}{1}]'.format(c.lower(), c.upper())
            for c in species_abbv if c.isalpha())
        if verbose:
            print('Abbreviated species name: ', species_abbv, indent=indent)
            print('RegEx species abbreviation: ',
                  species_abbv_insensitive,
                  indent=indent)
        glob_path = find_candidates(species_abbv_insensitive)

    if verbose:
        print(glob_path, indent=indent)
    ranked = sorted(glob_path, reverse=True)
    if not ranked:
        print('WARNING: COULD NOT FIND DATABASE! ABORTING!', indent=indent)
        raise DatabaseNotFoundError('', 'No databases were found!')
    search_db = ranked[0]
    if verbose:
        print('{0} DB chosen: {1}'.format(search_type, str(search_db)),
              indent=indent)
    return search_db
def blat_server(twobit,
                order='start',
                host='localhost',
                port=20000,
                type='blat',
                log='/dev/null',
                species=None,
                search_db_loc='/usr/db/blat',
                verbose=1,
                indent=0,
                try_limit=10,
                **kwargs):
    """Convenience function that controls a gfServer. Still in alpha.

    This function is a Python wrapper around the ``gfServer`` command line
    program. The user can either provide a .2bit file, or provide a species
    and set ``twobit="auto"`` to have ``get_searchdb()`` find a .2bit file.

    What happens depends on ``order``:

    * ``'status'``: runs ``gfServer status <host> <port>`` and returns 0 if
      the output contains "couldn't connect to localhost", raises
      ``BLATServerError`` if it contains "error", and returns 1 otherwise.
    * ``'stop'``: runs ``gfServer stop <host> <port>`` and returns None.
    * anything else (``'start'`` by default): asks an external helper
      script for a port, launches ``gfServer <order> <host> <port>
      -canStop ...`` in the background, then polls ``gfServer status``
      every 30 seconds until the server answers.

    Usage::

        >>> blat_server(twobit='hg38.2bit', port=20000, verbose=3)
        gfServer start localhost 20001 -canStop -stepSize=5 hg38.2bit
        # Polls 'gfServer status localhost 20001' every 30 seconds; as soon
        # as the answer is neither an error nor "Couldn't connect...", the
        # port where the server was opened is returned.
        20001

        >>> blat_server(twobit='auto', port=20000, species='Homo sapiens', verbose=3)
        # Calls get_searchdb('blat', 'Homo sapiens', db_loc=search_db_loc)
        20001

        >>> blat_server(twobit='hg38.2bit', port=20000, order='status', verbose=3)
        # 1 if the server is active, 0 if no connection could be made;
        # BLATServerError is raised if gfServer reports an error.

    :param str twobit: Path to the .2bit file for the server, or 'auto'.
    :param str order: A command for gfServer: start, stop, status, files,
        query, protQuery, transQuery, pcr, direct or pcrDirect. Only
        'status' and 'stop' get special handling here.
    :param str host: Address at which to host the server.
    :param int port: Port that is passed to the port-finding helper (or
        used directly for 'status' / 'stop').
    :param str type: 'blat' adds ``-stepSize=5``; 'tblat' adds ``-trans``
        and ``-mask`` for protein queries.
    :param str log: Path and name of a log file.
    :param str species: Species name used by get_searchdb() when
        twobit='auto'.
    :param str search_db_loc: Folder containing the .2bit files.
    :param int verbose: Level of verbosity; 0 suppresses all output except
        the unconditional ``print(order)`` on start.
    :param int indent: Indentation level of print output.
    :param int try_limit: Number of "couldn't connect" answers tolerated
        while polling, at 30-second intervals, before timing out.
    :param kwargs: Extra options, each passed to gfServer as ``-key=value``.
    :return: For 'status', 0 or 1; for 'stop', None; otherwise the port the
        server was started on, as the stripped text printed by the helper
        script (a str, not an int).
    :raises BLATServerError: If the automatically found .2bit file is not a
        file, or gfServer status reports an error.
    :raises TimeoutError: If the server never answers within ``try_limit``.
    """
    # NOTE(review): ``log`` is accepted but never referenced in this body.
    # Regular: gfServer start localhost portX -stepSize=5 -log=untrans.log database.2bit
    # Prot>DNAX: gfServer start localhost portY -trans -mask -log=trans.log database.2bit
    gfserver_suppl_args = list()
    if twobit == 'auto' and order != 'stop':
        if verbose:
            print('2bit set to auto: searching for 2bit file for species ',
                  species,
                  indent=indent)
        twobit = get_searchdb(search_type='blat',
                              species=species,
                              db_loc=search_db_loc,
                              verbose=verbose,
                              indent=indent + 1)
        # Only the file name (not the full path) is kept for the command.
        if twobit.exists() and twobit.is_file():
            twobit = twobit.name
        else:
            raise BLATServerError('Invalid 2bit file!')
    # Every remaining keyword becomes a '-key=value' gfServer option.
    # NOTE(review): 'order', 'host' and 'port' are named parameters, so
    # they can never arrive in kwargs; those three branches look dead.
    for key, item in kwargs.items():
        if key == 'order':
            order = item
        elif key == 'host':
            host = item
        elif key == 'port':
            port = item
        else:
            gfserver_suppl_args.append('-{0}={1}'.format(key, item))
    if order == 'status':
        gfcheck = subprocess.Popen('gfServer status {0} {1}'.format(
            str(host), str(port)),
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   universal_newlines=True,
                                   shell=True,
                                   executable='/bin/bash')
        out, _ = gfcheck.communicate()
        # NOTE(review): the message is matched against the literal host
        # name "localhost", whatever ``host`` was given -- confirm.
        if "couldn't connect to localhost" in out.lower():
            return 0
        elif "error" in out.lower():
            raise BLATServerError(out)
        else:
            return 1
    elif order == 'stop':
        subprocess.check_call('gfServer stop {0} {1}'.format(
            str(host), str(port)),
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT,
                              universal_newlines=True,
                              shell=True,
                              executable='/bin/bash')
        return
    else:
        print(order)
        # Todo: make the portsniffer its own function and make sure it works properly.
        # NOTE(review): hard-coded, user-specific script path; the call
        # fails on machines where this script does not exist.
        portfinder = subprocess.check_output(
            '/home/manny/Scripts/oneshot/checkifportisopen.sh {}'.format(
                str(port)),
            universal_newlines=True,
            shell=True,
            executable='/bin/bash')
        # From here on ``port`` is the script's output with trailing
        # whitespace removed.
        port = portfinder.rstrip()

        gfserver_cmd = [
            'gfServer',
            str(order),
            str(host),
            str(port), '-canStop'
        ]
        if type == 'blat':
            gfserver_cmd.append('-stepSize=5')
        elif type == 'tblat':
            gfserver_cmd += ['-trans', '-mask']
        if gfserver_suppl_args:
            gfserver_cmd += gfserver_suppl_args
        gfserver_cmd_str = ' '.join(gfserver_cmd + [twobit])
        if verbose > 2:
            print(gfserver_cmd_str, indent=indent)
        # Launched without waiting; readiness is detected by polling below.
        # NOTE(review): the command is a shell string built from the
        # arguments (shell=True); callers must not pass untrusted values.
        subprocess.Popen(gfserver_cmd_str,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True,
                         shell=True,
                         executable='/bin/bash')
        tries = 0
        while tries <= try_limit:
            sleep(30)
            gfcheck = subprocess.Popen('gfServer status {0} {1}'.format(
                str(host), str(port)),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       universal_newlines=True,
                                       shell=True,
                                       executable='/bin/bash')
            out, _ = gfcheck.communicate()
            if verbose > 2:
                print(out)
            # Only a failed connection counts as a try; any other
            # non-error answer means the server is up.
            if "couldn't connect to localhost" in out.lower():
                tries += 1
            elif "error" in out.lower():
                raise BLATServerError(out)
            else:
                if verbose:
                    print(out)
                return port
        if tries > try_limit:
            raise TimeoutError('Timed out!')
def id_ranker(record, perc_score, perc_query_span, perc_ident, expect=None,
              indent=0, verbose=1, same_strand=True, return_only=None):
    """Filter the HSPs of a search result and return them as BED-like rows.

    HSPs are filtered, in order, by query span, score, percent identity and
    (optionally) strand consistency. The surviving hits are sorted by
    descending summed HSP score, and the HSPs inside each hit by descending
    score, before being converted to rows.

    :param record: Search result to rank. It must expose ``program``, ``id``,
        ``hsp_filter`` and ``sort`` and iterate over hits (e.g. a
        SearchIO.QueryResult).
    :param float perc_score: Fraction of the top HSP score that an HSP must
        reach. A falsy value disables this filter.
    :param float perc_query_span: Fraction of the longest HSP query span that
        an HSP must reach. A falsy value disables this filter.
    :param perc_ident: Minimum ``hsp.ident_pct`` of an HSP. A falsy value
        disables this filter.
    :param float expect: Not used by this function; kept for interface
        compatibility.
    :param int indent: Indent level for pretty print. [Default: 0]
    :param int verbose: Verbosity level. [Default: 1]
    :param bool same_strand: Drop HSPs whose fragments lie on different
        strands? [Default: True]
    :param return_only: If truthy, stop after this many HSPs.
    :return list: One 12-item list per HSP: chrom, (start, end), query id,
        score, strand, thickStart, thickEnd, rgb, block count, block spans,
        block starts (relative to start) and query range.
    :raises NotImplementedError: If ``record.program`` is neither ``'blat'``
        nor contains ``'blast'``.
    :raises NoHitsError: If an enabled filter removes every HSP.
    """
    if verbose:
        print('Beginning ID_Ranker...', indent=indent)
    if record.program == 'blat':
        if verbose > 2:
            print('Results obtained from BLAT run.', indent=indent + 1)
    elif 'blast' in record.program:
        if verbose > 2:
            print('Results obtained from BLAST run.', indent=indent + 1)
    else:
        raise NotImplementedError('Sorry, your program {} is not yet '
                                  'implemented for RecBlast!'.format(
                                      record.program))

    # Filter / sort-key functions. top_score and top_length are bound below,
    # before any of these is called.
    def hsp_minscores(hsp):
        return hsp.score >= int(perc_score * top_score)

    def hsp_min_query_span(hsp):
        return hsp.query_span >= perc_query_span * top_length

    def hsp_perc_ident(hsp):
        return hsp.ident_pct >= perc_ident

    def hsp_same_strand(hsp):
        return all(i == hsp.hit_strand_all[0] for i in hsp.hit_strand_all)

    def hit_sort_scores(hit):
        return sum(hsp.score for hsp in hit.hsps)

    def hsp_sort_scores(hsp):
        return hsp.score

    def report_hsp_count():
        print('Number of HSPs for {}:\t'.format(record.id),
              sum(len(i.hsps) for i in record), indent=indent)

    # Get top stats:
    top_score = max(hsp.score for hit in record for hsp in hit.hsps)
    if verbose > 1:
        print('Top score for {}:\t'.format(record.id), top_score,
              indent=indent)
    top_length = max(hsp.query_span for hit in record for hsp in hit)
    if verbose > 1:
        print('Longest hit for {}:\t'.format(record.id), top_length,
              indent=indent)

    if verbose > 2:
        print("ALL HITS STATS:")
        print('|\tHit Name:\t|\t# HSPs\t|\tScore:\t|\tLength:\t|\tP.Ident\t|')
        print("==========================================================")
        for hit in record:
            print('|\t{HitName}\t|\t{HSP}\t|'.format(HitName=hit.id,
                                                     HSP=len(hit.hsps)))
            print("------------------------------------------------------")
            for hsp in hit:
                print('|\t{id}\t|\t{hf}\t|\t{score}\t|\t{length}\t|\t{ident}\t|'
                      .format(id=hsp.hit_id, hf=len(hsp), score=hsp.score,
                              length=hsp.hit_span, ident=hsp.ident_pct))

    # Execute filters. The threshold messages are only built when the filter
    # is enabled, so a disabled (None) threshold can no longer raise TypeError
    # while formatting a log line.
    # Query span
    if verbose > 1:
        report_hsp_count()
    if perc_query_span:
        if verbose > 1:
            print('Filtering out all HSPs shorter than {}...'.format(
                perc_query_span * top_length), indent=indent)
        record = record.hsp_filter(hsp_min_query_span)
        if not record:
            raise NoHitsError(
                ('No hits in Query Results match a stretch of the query sequence longer than '
                 '{0}!').format((top_length * perc_query_span)))
    # Score
    if verbose > 1:
        report_hsp_count()
    if perc_score:
        if verbose > 1:
            print('Filtering out all HSPs with scores less than {}...'.format(
                top_score * perc_score), indent=indent)
        record = record.hsp_filter(hsp_minscores)
        if not record:
            raise NoHitsError(
                'No hits in Query Results have a score above the minimum of {0}!'.format(
                    (top_score * perc_score)))
    # Percent identity
    if verbose > 1:
        report_hsp_count()
    if perc_ident:
        if verbose > 1:
            print('Filtering out all HSPs with percent identity below {}...'.format(
                perc_ident), indent=indent)
        record = record.hsp_filter(hsp_perc_ident)
        if not record:
            raise NoHitsError(
                'No hits in Query Results have a percent identity above {}%!'.format(
                    round(perc_ident * 100, 2)))
    # Strand consistency
    if verbose > 1:
        report_hsp_count()
    if same_strand:
        if verbose > 1:
            print('Filtering out all HSPs that have fragments on opposite strands...')
        record = record.hsp_filter(hsp_same_strand)
        if not record:
            raise NoHitsError('No hits in Query Results are on the same strand!')

    # Sorting them for good measure
    if verbose > 1:
        print('Sorting all hits by descending scores!', indent=indent)
    record.sort(key=hit_sort_scores, reverse=True, in_place=True)
    for hit in record:
        hit.sort(key=hsp_sort_scores, reverse=True, in_place=True)
    if verbose > 1:
        print('Done!', indent=indent)

    # Add items to id_list
    # Big note: think in HSPs, not Hits
    id_list = []
    for hit in record:
        for hsp in hit:
            # "+" / "-" only when every fragment agrees; "." otherwise.
            strand = "."
            if hsp._has_hit_strand:
                strands = set(hsp.hit_strand_all)
                if len(strands) == 1:
                    strand = "+" if strands == {1} else "-"
            if verbose > 2:
                print("Adding hit {chr}:{s}-{e}({st}) to id list".format(
                    chr=hsp.hit_id, s=str(hsp.hit_range[0]),
                    e=str(hsp.hit_range[1]), st=strand), indent=indent)
            # hsp.hit_start_all is not necessarily ordered by position, so
            # pair each start with its span, sort by start, then unzip.
            block_starts, block_spans = zip(*sorted(
                zip(hsp.hit_start_all, hsp.hit_span_all), key=itemgetter(0)))
            # chr (start,end) id score strand thickStart thickEnd rgb
            # blockcount blockspans blockstarts query_span
            id_list.append([
                hsp.hit_id, hsp.hit_range, hsp.query_id, hsp.score, strand,
                hsp.hit_range[0], hsp.hit_range[1], "255,0,0",
                len(hsp.hit_start_all),
                ",".join([str(i) for i in block_spans]),
                ",".join([str(i - hsp.hit_range[0]) for i in block_starts]),
                hsp.query_range
            ])
            if return_only and len(id_list) == return_only:
                if verbose:
                    print('Returning only the top {} hits, ending here!'.format(
                        return_only), indent=indent)
                return id_list
    return id_list
def count_reciprocal_best_hits(recblast_out):
    """Count, per species, how often a query's reciprocal hit maps back to it.

    For every hit under ``query_dict['recblast_results']`` the annotation part
    of ``hit.description`` (the text after ``'|-|'``) is scanned for
    ``|[...]|`` items; the joined items are resolved with ``id_search`` and
    the query's counter is incremented when the resulting symbol equals the
    query name.

    :param recblast_out: Mapping of species -> query -> record dict.
    :return dict: Mapping of species -> ``Counter`` of matching queries.
    """
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation
    species_counters = {}
    for species, species_dict in recblast_out.items():
        counter = species_counters[species] = Counter()
        for query, query_dict in species_dict.items():
            try:
                rc_out = query_dict['recblast_results']
            except KeyError:
                print('No entries in recblast_results for query {0} in species {1}'.format(query, species))
                continue
            for hit in rc_out:
                try:
                    # Index 1 is missing when the description has no '|-|'.
                    annotations = hit.description.split('|-|')[1]
                except (ValueError, IndexError):
                    print(hit.description, indent=2)
                    print('Could not unpack annotations!', indent=2)
                    continue
                id_lst = ''.join(pat.findall(annotations))
                if not id_lst:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(hit.name, species,
                                                                                                 query))
                    continue
                _, hit_symbol, _, _ = id_search(id_lst, id_type='symbol', verbose=0)
                if query == hit_symbol:
                    counter[query] += 1
    return species_counters
def blast_run(seq_record, species, database, blast_type, filetype="fasta", local_blast=False, expect=0.005,
              megablast=True, use_index=False, perc_ident=75, verbose=True, indent=0, n_threads=1,
              blastdb='/usr/db/blastdb/', outtype=5, return_raw=False, **kwargs):
    """A wrapper function for BLAST searches.

    :param seq_record: The record containing the query sequence for the search. Can be either a
        SeqIO.SeqRecord or a string with the file location.
    :param str species: The species whose sequence database will be queried.
    :param Union[dict, str, Path] database: The name of the database to be used in the search.
    :param str blast_type: Type of BLAST search being performed.
    :param str filetype: Filetype of seq_record (if seq_record is a SeqRecord object, leave as default).
        [Default: 'fasta']
    :param bool local_blast: Should the search be conducted locally or on remote servers? [Default: False]
    :param float expect: Highest expect value of BLAST results to be returned. [Default: 0.005]
    :param bool megablast: Should MegaBLAST be used for nucleotide searches? [Default: True]
    :param bool use_index: Should BLAST use indexes associated with the database files? [Default: False]
    :param int perc_ident: Minimum percent identity required of results to be returned. [Default: 75]
    :param bool verbose: Verbose output? [Default: True]
    :param int indent: Indent level for pretty print. [Default: 0]
    :param int n_threads: Number of threads to allocate for local BLAST. [Default: 1]
    :param str blastdb: Working directory for the local BLAST process. [Default: '/usr/db/blastdb/']
    :param int outtype: Value passed as ``-outfmt`` to local BLAST. [Default: 5]
    :param bool return_raw: Return raw output rather than a parsed record? [Default: False]
    :param kwargs: Additional keyword arguments passed on to the remote search only.
    :return: ``(blast_record, blast_err)``, or ``(raw_result, blast_err)`` if ``return_raw``.
        ``blast_err`` is always None when this function returns.
    :raises SearchError: If ``local_blast`` is set and ``blast_type`` is not a known BLAST program.
    """
    if not isinstance(seq_record, SeqIO.SeqRecord):
        seq_record = SeqIO.read(seq_record, filetype)
    args = dict()
    if verbose:
        print("Now starting BLAST...", indent=indent)

    if local_blast:
        # build up the BLAST arguments:
        args.update({
            '-db': str(database),
            '-evalue': expect,
            '-outfmt': str(outtype),
            '-num_threads': n_threads
        })
        if blast_type == 'blastn':
            if megablast:
                args['-task'] = 'megablast'
            if use_index:
                args['-use_index'] = use_index
            args['-perc_identity'] = perc_ident
        # Flatten {flag: value} into [flag, value, flag, value, ...].
        args_expanded = []
        for flag, value in args.items():
            args_expanded += [flag, value]
        if verbose:
            print('Running BLAST locally...', indent=indent)
            print('Options:', indent=indent)
            print(args_expanded, indent=indent + 1)
        if blast_type not in ("blastn", "blastp", "blastx", "tblastx", "tblastn"):
            raise SearchError("Invalid blast choice!")
        blast_cline = [blast_type] + args_expanded
        # With universal_newlines=True, check_output returns the stdout text;
        # a non-zero exit status raises subprocess.CalledProcessError.
        blast_result = subprocess.check_output([str(i) for i in blast_cline],
                                               input=seq_record.format('fasta'),
                                               universal_newlines=True,
                                               cwd=blastdb)
        blast_err = None
    else:
        args.update(
            dict(program=str(blast_type),
                 database=str(database),
                 sequence=seq_record.format('fasta'),
                 entrez_query='"{}"[ORGN]'.format(species),
                 expect=expect,
                 perc_ident=perc_ident))
        # Logical "and": a bitwise "&" gives wrong answers for truthy
        # non-bool values of megablast (e.g. 2 & True == 0).
        if megablast and blast_type == 'blastn':
            args['megablast'] = 'True'
        if kwargs:
            args.update(**kwargs)
        if verbose:
            print('Submitting Remote BLAST! Options passed:', indent=indent)
            for k, v in args.items():
                print('{0}\t=\t{1}'.format(k, v), indent=indent + 1)
        try:
            blast_result = NCBIWWW.qblast(**args)
            blast_err = None
        except Exception as err:
            print(type(err), err)
            raise

    if verbose:
        print('Done with Blast!', indent=indent)
    if return_raw:
        return blast_result, blast_err

    if isinstance(blast_result, StringIO):
        blast_record = NCBIXML.read(blast_result)
    else:
        try:
            with StringIO(''.join(blast_result)) as fin:
                blast_record = NCBIXML.read(fin)
        except Exception as err:
            print('Error reading Blast Results! Aborting!', indent=indent)
            print('Error details:\n', err, indent=indent)
            raise
    return blast_record, blast_err
def count_dups(recblast_out):
    """Invert the target -> annotation mapping and count targets per top annotation.

    The input is first normalised with ``simple_struct(recblast_out, verbose=False)``.
    For every target, the first annotation in its list is taken as the top
    hit; targets whose annotation list is empty are skipped.

    :param recblast_out: Data accepted by ``simple_struct``.
    :return tuple: ``(species_anno_target_dict, species_anno_count_dict)``, mapping
        species -> annotation -> list of target ids, and
        species -> annotation -> number of target ids.
    """
    species_anno_target_dict = {}
    species_anno_count_dict = {}
    master_dict = simple_struct(recblast_out, verbose=False)

    for species, species_dict in master_dict.items():
        anno_target_dict = species_anno_target_dict.setdefault(species, {})
        print(species_dict, indent=0)
        for query_dict in species_dict.values():  # the query name is not needed
            print(query_dict, indent=1)
            for target_id, annotation_list in query_dict.items():
                print(annotation_list, indent=2)
                if not annotation_list:
                    # A target can have no annotations at all; there is no
                    # top hit to file it under, so skip it instead of raising
                    # IndexError.
                    continue
                tophit = annotation_list[0]
                print(tophit, indent=2)
                anno_target_dict.setdefault(tophit, []).append(target_id)
                print(anno_target_dict[tophit], indent=3)

    for species, anno_dict in species_anno_target_dict.items():
        print(species, indent=0)
        anno_count_dict = species_anno_count_dict.setdefault(species, {})
        for annotation, target_list in anno_dict.items():
            print(annotation, '\t\t\t', len(target_list))
            anno_count_dict[annotation] = len(target_list)
    return species_anno_target_dict, species_anno_count_dict
def simple_struct(recblast_out, verbose=True):
    """Reduce RecBlast output to nested dicts of queries, targets and annotations.

    Structure of the result::

        master_dict:
            species -> query -> target_id -> list of annotations

    A list input is processed element by element and the per-element results
    are merged. For a single mapping, every record under
    ``['recblast_results']`` contributes the ``|[...]|`` items found in the
    part of ``record.description`` after ``'|-|'``.

    When ``verbose`` is true, the structure is also printed and every
    annotation is replaced by the item returned from
    ``id_search(annotation, id_type='brute', verbose=0)``.

    :param recblast_out: A mapping species -> query -> record dict, or a list
        of such mappings. A ``'__dict__'`` key, if present, is deleted from
        the mapping.
    :param bool verbose: Print the structure and resolve annotations?
        [Default: True]
    :return dict: The nested dictionary described above.
    """
    master_dict = {}
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation

    if isinstance(recblast_out, list):
        # NOTE(review): the recursive call uses the default verbose=True even
        # when this call was made with verbose=False. That is kept as-is
        # because verbose also controls the id_search conversion, so passing
        # it through would change the returned data.
        master_count = []
        for rc in recblast_out:
            try:
                master_count.append(simple_struct(rc))
            except AttributeError:
                master_count.append(rc)
        for subdict in master_count:
            for species, species_dict in subdict.items():
                if isinstance(species_dict, Exception):
                    continue
                comb_spec_dict = master_dict.setdefault(species, {})
                for query, query_dict in species_dict.items():
                    comb_query_dict = comb_spec_dict.setdefault(query, {})
                    for target_id, annotation_list in query_dict.items():
                        comb_anno_list = comb_query_dict.setdefault(target_id, [])
                        comb_anno_list += annotation_list if isinstance(annotation_list, list) else [annotation_list]
        return master_dict

    try:
        recblast_out.__delitem__('__dict__')
    except KeyError:
        pass
    for species, rc_spec_rec in recblast_out.items():
        species_dict = master_dict.setdefault(species, {})
        for query, rc_rec in rc_spec_rec.items():
            query_dict = species_dict.setdefault(query, {})
            try:
                rc_out = rc_rec['recblast_results']
            except KeyError:
                print('No entries in recblast_results for query {0} in species {1}'.format(query, species))
                continue
            for record in rc_out:
                try:
                    # Exactly two parts are expected around the '|-|' marker.
                    target_id, annotations = record.description.split('|-|')
                except ValueError:
                    print(record.description, indent=2)
                    continue
                target_list = query_dict.setdefault(target_id, [])
                id_lst = pat.findall(annotations)
                if id_lst:
                    target_list += id_lst
                else:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(record.name,
                                                                                                 species,
                                                                                                 query))
    if verbose:
        print('*******************************************')
        for species, species_dict in master_dict.items():
            print(species, indent=0)
            for query, query_dict in species_dict.items():
                print(query, indent=1)
                for target_id, annotation_list in query_dict.items():
                    print(target_id, indent=2)
                    resolved = []
                    for annotation in annotation_list:
                        _, item, _, _ = id_search(annotation, id_type='brute', verbose=0)
                        resolved.append(item)
                    query_dict[target_id] = resolved
                    for annotation in resolved:
                        print(annotation, indent=3)
        print('*******************************************')
    return master_dict
def _biosql_lookup_with_retry(dtbase, lookup_kwargs):
    """Fetch and convert one record, retrying once after 0.1 s if KeyError is raised."""
    try:
        return biosql_dbseqrecord_to_seqrecord(dtbase.lookup(**lookup_kwargs))
    except KeyError:
        sleep(0.1)
        return biosql_dbseqrecord_to_seqrecord(dtbase.lookup(**lookup_kwargs))


def biosql_seq_lookup_cascade(dtbase, sub_db_name, id_type, identifier, indent=0, verbose=False):
    """Look up a sequence record, falling back through several lookup keys.

    The strategies are tried in order until one succeeds:

    1. the declared ``id_type`` (``'scaffold'`` is looked up as ``name``);
    2. the same key with everything after the first ``'.'`` removed from
       the identifier;
    3. ``primary_id``;
    4. ``name``;
    5. a key typed in interactively by the user (``'exit'`` terminates the
       interpreter via ``SystemExit``).

    A strategy counts as failed when the lookup raises ``IndexError``. In
    strategies 1-4 a ``KeyError`` triggers one retry after 0.1 s; an
    ``IndexError`` from that retry also counts as a failed strategy, while a
    second ``KeyError`` propagates to the caller.

    :param dtbase: Object with a ``lookup(**{key: value})`` method.
    :param str sub_db_name: Name of the sub-database; used in messages only.
    :param str id_type: Declared identifier type, used as the lookup key.
    :param str identifier: Identifier to look up.
    :param int indent: Indent level for pretty print. [Default: 0]
    :param bool verbose: Verbose output? [Default: False]
    :return: The converted record, or an empty ``SeqRecord`` if every
        strategy failed.
    """
    empty_record = SeqRecord(seq='')

    def attempt(lookup_kwargs, warning, subject):
        """Run one strategy; return ``(found, record)``."""
        try:
            record = _biosql_lookup_with_retry(dtbase, lookup_kwargs)
        except IndexError as err:
            if verbose:
                print(warning.format(subject, err), indent=indent)
            return False, None
        if verbose:
            print('\tGot sequence for {}!'.format(identifier), indent=indent)
        return True, record

    lookup_key = 'name' if id_type == 'scaffold' else id_type

    # 1. Declared ID type.
    if verbose:
        print("\t\tNow searching database {0} for {1}: {2}".format(
            sub_db_name, id_type, identifier), indent=indent)
    found, seqrec = attempt(
        {lookup_key: identifier},
        "WARNING: couldn't find {0} using given ID type... \n Full error: {1}",
        identifier)
    if found:
        return seqrec

    # 2. Same key, identifier without its sub-number.
    identifier_sans_subnumber = identifier.split('.')[0]
    if verbose:
        print('\t\tSeeing if removing any sub-numbers (acc: xxxxxx.1 for example) helps...',
              indent=indent)
        print('\t\tIdentifier: ', identifier_sans_subnumber, indent=indent)
        print("\t\tNow searching database {0} for {1}: {2}".format(
            sub_db_name, id_type, identifier_sans_subnumber), indent=indent)
    found, seqrec = attempt(
        {lookup_key: identifier_sans_subnumber},
        "WARNING: couldn't find {0} using abbreviated ID... \n Full error: {1}",
        identifier_sans_subnumber)
    if found:
        return seqrec

    # 3. Primary ID.
    if verbose:
        print('\t\tAttempting to search using Primary ID instead of declared type:',
              indent=indent)
    found, seqrec = attempt(
        {'primary_id': identifier},
        "WARNING: couldn't find {0} using Primary ID... \n full error: {1}",
        identifier)
    if found:
        return seqrec

    # 4. Name.
    if verbose:
        print('\t\tAttempting to search using name instead of declared type:',
              indent=indent)
    found, seqrec = attempt(
        {'name': identifier},
        "WARNING: Still couldn't find {0} using name search: \n full error: {1}",
        identifier)
    if found:
        return seqrec

    # 5. Ask the user. NOTE(review): this blocks on stdin; confirm that it is
    # acceptable for every caller of this function.
    seqrec = empty_record
    try:
        lookup_key = input('Last shot, chose an ID type: '
                           '[accession, primary_id, gi, version, display_id, name]')
        if lookup_key == 'exit':
            raise SystemExit
        seqrec = biosql_dbseqrecord_to_seqrecord(
            dtbase.lookup(**{lookup_key: identifier}))
        if verbose:
            print('\tGot sequence for {}!'.format(identifier), indent=indent)
    except IndexError as err5:
        if verbose:
            print("WARNING: COULD NOT FIND SEQUENCES FOR ID:{0}: \n full error: {1}"
                  .format(identifier, err5), indent=indent)
    return seqrec
def fetchseq(ids, species, write=False, output_name='', delim='\t', id_type='brute', server=None, source="SQL",
             database="bioseqdb", database_path=None, host='localhost', driver='psycopg2', version='1.0',
             user='******', passwd='', email='', batch_size=50, output_type="fasta", verbose=1, n_threads=1,
             n_subthreads=1, add_length=(0, 0), indent=0):
    """Fetch the sequences for a collection of ID records using worker processes.

    Each ID record is wrapped in a ``FetchSeq`` job and put on a queue served
    by ``n_threads`` ``FetchSeqMP`` processes; results are collected and
    returned in the order of the input records.

    :param ids: A list or generator of 12-item ID records (chr, (start, end), id, score, strand,
        thickStart, thickEnd, rgb, blockcount, blockspans, blockstarts, query_span), or an object
        with an ``open`` method from which records are read one per line.
    :param species: Species passed on to the workers.
    :param bool write: Write the sequences to ``output_name`` instead of returning them? [Default: False]
    :param str output_name: Output file used when ``write`` is true.
    :param str source: Sequence source. Any value containing 'sql' (case-insensitive) uses a BioSQL
        server; 'fasta', '2bit' and 'twobit' are passed through to the workers. [Default: 'SQL']
    :param server: An already opened BioSQL server; opened here if None and ``source`` is SQL.
    :param str output_type: Format passed to ``SeqIO.write``. [Default: 'fasta']
    :param int verbose: Verbosity level. [Default: 1]
    :param int n_threads: Number of worker processes. [Default: 1]
    :param int indent: Indent level for pretty print. [Default: 0]

    The remaining parameters (delim, id_type, database, database_path, host, driver, version, user,
    passwd, email, batch_size, n_subthreads, add_length) are handed to ``FetchSeqMP`` and, for SQL
    sources, to ``BioSeqDatabase.open_database``.

    :return: None if ``write`` is true; the string ``'None'`` if an ID file was empty; otherwise
        ``(output_list, missing_items_list)``, where ``missing_items_list`` is None when it would
        be exactly ``[None]``.
    :raises AssertionError: If an ID record does not have 12 items.
    :raises SearchEngineNotImplementedError: If ``source`` is not recognised.
    """
    if isgenerator(ids):
        if verbose > 1:
            print('Received generator!', indent=indent)
        # The records are iterated twice and counted below, which a generator
        # cannot support, so materialise it once.
        ids = list(ids)
    elif isinstance(ids, list):
        if verbose > 1:
            print('Received list!', indent=indent)
    else:
        if verbose > 1:
            print('Reading ID File... ', indent=indent)
        # Read mode: opening with 'w' would truncate the ID file and make it
        # unreadable.
        with ids.open('r') as in_handle:
            id_prelist = [line.strip() for line in in_handle]  # list of each line in the file
            print('Done!', indent=indent)
        ids = [id_item for id_item in filter(None, id_prelist) if id_item]
        if not id_prelist or id_prelist is None:
            if verbose:
                print('id_prelist is empty!', indent=indent)
            return 'None'

    for id_item in ids:
        assert len(id_item) == 12, (
            "Item {0} in id_list has {1} items, not 12!\n"
            "Format should be: "
            "chr, (start,end), id, score, strand, thickStart, thickEnd, rgb, blockcount,"
            " blockspans, blockstarts, query_span"
            "!").format(" ".join(str(item) for item in id_item), len(id_item))
    if verbose > 1:
        print('Readied ids!', indent=indent)

    id_list = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    if 'sql' in source.lower():
        if server is None:
            try:
                if verbose > 1:
                    print('No server received, opening server...', indent=indent)
                server = BioSeqDatabase.open_database(driver=driver, user=user, passwd=passwd, host=host,
                                                      database=database)
                if verbose > 1:
                    print('Done!', indent=indent)
            except Exception as err:
                if verbose > 1:
                    print('Failed to open server!', indent=indent)
                    print(str(type(err)), err, sep=' ', indent=indent)
                raise
        else:
            if verbose > 1:
                print('Received server handle:', indent=indent)
                print(server, indent=indent)
            if verbose > 2:
                print('Please note the sub_databases of server:\n\t', [str(i) for i in server.keys()],
                      indent=indent)
    elif source.lower() in ['fasta', '2bit', 'twobit']:
        print('Search type: ', source, indent=indent)
    else:
        raise SearchEngineNotImplementedError(
            'Search using source {} has not yet been implemented!'.format(source))

    if verbose > 1:
        print('Creating FecSeq Processes...', indent=indent)
    fs_instances = [
        FetchSeqMP(id_queue=id_list, seq_out_queue=results, delim=delim, id_type=id_type, server=server,
                   species=species, source=source, database=database, database_path=database_path,
                   host=host, driver=driver, version=version, user=user, passwd=passwd, email=email,
                   output_type=output_type, batch_size=batch_size, verbose=verbose,
                   n_subthreads=n_subthreads, add_length=add_length, indent=indent + 1)
        for _ in range(n_threads)
    ]
    if verbose > 1:
        print('Done! Starting processes...', indent=indent)
    for fs in fs_instances:
        fs.start()
    if verbose > 1:
        print('Done!', indent=indent)
        print('Assigning FetchSeq records to queue... ', indent=indent)

    id_order = []
    n_jobs = 0  # number of jobs actually queued
    for i, id_rec in enumerate(ids):
        try:
            id_order.append("{0}:{1}-{2}".format(id_rec[0], id_rec[1][0], id_rec[1][1]))
        except IndexError:
            id_order.append("{0}".format(id_rec[0]))
        try:
            id_list.put(FetchSeq(id_rec=id_rec))
        except AssertionError as err:
            print(i, type(err), err, sep=' ')
            break
        n_jobs += 1
    # One sentinel per worker so that each of them stops.
    for _ in fs_instances:
        id_list.put(None)
    if verbose > 1:
        print('Done!', indent=indent)

    output_dict = dict()
    missing_items_list = list()
    if verbose > 1:
        print('Getting sequences from processes... ', indent=indent)
    # Wait only for the jobs that were queued; waiting for len(ids) results
    # would block forever after an early break above.
    for _ in range(n_jobs):
        seq, missing = results.get()
        output_dict[seq[0]] = seq[1]
        missing_items_list.append(missing)
    if verbose > 1:
        print('Done! Finished fetching sequences!', indent=indent)
        print('Closing processes!', indent=indent)
    for fs in fs_instances:
        if fs.is_alive():
            fs.join()

    output_list = [output_dict[i] for i in id_order if i in output_dict]
    if write:
        SeqIO.write(output_list, output_name, output_type)
        return
    if missing_items_list == [None]:
        missing_items_list = None
    return output_list, missing_items_list
def sql(id_item, seq_range, source, species, id_type, user, host, passwd, database, n_threads, version, server,
        indent, verbose):
    """Fetch one sequence from a BioSQL server and slice it to ``seq_range``.

    The sub-database name is built from the first three letters of each word
    of ``species`` (title-cased) followed by ``version``. With
    ``version='auto'`` the server's sub-databases containing that prefix are
    searched and the one that sorts highest is used.

    :param id_item: Identifier (or list of identifiers) passed to ``biosql_get_record``.
    :param seq_range: ``(start, end)`` used to slice the returned sequence.
    :param str source: ``'mysql'`` (case-insensitive) selects the mysql driver; anything else
        selects psycopg2.
    :param str species: Species name used to derive the sub-database name.
    :param str version: Sub-database version suffix, or ``'auto'``.
    :param server: BioSQL server; its ``keys()`` are the sub-database names.
    :param int indent: Indent level for pretty print.
    :param int verbose: Verbosity level.

    id_type, user, host, passwd, database and n_threads are forwarded to ``biosql_get_record``.

    :return tuple: ``(seq, itemnotfound)``.
    :raises NameError: If ``version='auto'`` and no sub-database matches the species.
    :raises AssertionError: If the lookup did not return exactly one sequence.
    """
    driver = "mysql" if source.lower() == 'mysql' else "psycopg2"
    if verbose > 1:
        print('Searching for sequences in local SQL database...', indent=indent)
    if verbose > 2:
        print('Please note the sub_databases of server:\n\t', [str(i) for i in server.keys()], indent=indent)

    sub_db_prefix = ''.join([i[0:3] for i in species.title().split(' ')])
    if version.lower() == 'auto':
        sub_db_list = [sub_db for sub_db in server.keys() if sub_db_prefix in sub_db]
        if len(sub_db_list) < 1:
            raise NameError('sub_db does not exist!')
        elif len(sub_db_list) == 1:
            sub_db_name = sub_db_list[0]
        else:
            if verbose:
                print('Multiple database versions found!', indent=indent)
                print(sub_db_list, indent=indent)
                print('Selecting highest DB', indent=indent)
            sub_db_name = max(sub_db_list)
        if verbose:
            print('Sub-DB chosen was ', sub_db_name, indent=indent)
    else:
        sub_db_name = sub_db_prefix + version

    try:
        # Forward the caller's verbosity and indent instead of forcing
        # verbose output.
        seq_dict, itemnotfound = biosql_get_record(sub_db_name=sub_db_name, passwd=passwd, id_list=id_item,
                                                   id_type=id_type, driver=driver, user=user, host=host,
                                                   database=database, num_proc=n_threads, server=server,
                                                   indent=indent, verbose=verbose)
    except Exception:
        print('Please note the sub_databases of server:\n\t', [str(i) for i in server.keys()], indent=indent)
        raise
    seq_ids = list(seq_dict.keys())
    assert len(seq_ids) == 1, 'Multiple sequences were returned for a single query!'
    seq = seq_dict[seq_ids[0]]
    seq = seq[slice(seq_range[0], seq_range[1])]
    return seq, itemnotfound
def __call__(self, delim, species, version, source, passwd, id_type, driver, user, host, database, n_threads,
             server, verbose, add_length, indent, database_path=None):
    """Fetch the sequence described by ``self.id_rec`` and annotate it.

    ``self.id_rec`` is unpacked into 12 fields (chr, seq_range, name, score,
    strand, thickStart, thickEnd, rgb, blockcount, blockspans, blockstarts,
    query_coverage). The sequence is fetched from ``source``, reverse
    complemented if ``format_range`` changed the strand, and the BED-style
    fields are stored on ``seq.features[0]``.

    :param delim: Not used by this method.
    :param species: Species used to select the database.
    :param str source: 'entrez', any value containing 'sql', 'fasta', '2bit' or 'twobit'
        (case-insensitive).
    :param database: Database name, a dict of species -> database, or 'auto' (resolved with
        ``get_searchdb`` for 2bit/twobit/blastdb sources).
    :param add_length: ``(upstream, downstream)`` passed to ``format_range`` unless it is ``(0, 0)``.
    :param int verbose: Verbosity level.
    :param int indent: Indent level for pretty print.
    :param database_path: Optional directory prepended to ``database``.

    version, passwd, id_type, user, host, n_threads and server are forwarded to ``self.sql``;
    ``driver`` is not used by this method.

    :return tuple: ``(id_full, seq, itemnotfound)``.
    :raises DatabaseNotFoundError: If ``species`` is missing from a ``database`` dict or
        ``source`` is not recognised.
    """
    # Case-insensitive everywhere, to match the check fetchseq makes before
    # it creates the jobs.
    source_lc = source.lower()

    if isinstance(database, dict):
        if species in database:
            database = database[species]
        else:
            raise DatabaseNotFoundError('No sequence source database for species {} '
                                        'was found in the provided dict!'.format(species))
    elif database == "auto" and source_lc in ["2bit", "twobit", "blastdb"]:
        database = get_searchdb(search_type=source, species=species, db_loc=database_path,
                                verbose=verbose, indent=indent + 1).name
    if database_path:
        database = database_path.rstrip("/") + '/' + database

    if verbose > 1:
        print('Full header for Entry:', indent=indent)
        print(self.id_rec, indent=indent)
    (item_chr, seq_range, item_name, score, strand, thickStart, thickEnd, rgb, blockcount,
     blockspans, blockstarts, query_coverage) = self.id_rec
    try:
        if verbose > 1:
            print('Seq range: ', seq_range, indent=indent)
        assert len(seq_range) == 2, 'Seq_range returned a tuple of length != 2!!!'
        old_strand = strand
        if add_length != (0, 0):
            seq_range, strand = format_range(seqrange=seq_range, strand=strand, addlength=add_length,
                                             indent=indent + 1, verbose=verbose)
            self.id_rec[1] = seq_range
            self.id_rec[4] = strand
        # A -1 coordinate means "no range": the ID is the bare chromosome.
        if -1 in seq_range[0:2]:
            id_full = '{0}'.format(item_chr)
        else:
            id_full = '{0}:{1}-{2}'.format(item_chr, seq_range[0], seq_range[1])
    except KeyError:
        raise KeyError('Sequence {0} lacks a seq_range entry!!!'.format(item_chr))
    if verbose:
        print('ID for query:\t', id_full, indent=indent)

    # Armed with the ID list, we fetch the sequences from the appropriate source
    if source_lc == "entrez":
        seq, itemnotfound = self.entrez(item_chr, seq_range, indent, add_length, verbose)
    elif 'sql' in source_lc:
        seq, itemnotfound = self.sql(id_item=item_chr, seq_range=seq_range, source=source, species=species,
                                     id_type=id_type, user=user, host=host, passwd=passwd,
                                     database=database, n_threads=n_threads, version=version,
                                     server=server, indent=indent, verbose=verbose)
    elif source_lc == "fasta":
        # Note: anecdotally, this doesn't run terribly fast - try to avoid.
        seq, itemnotfound = self.fasta(id_item=item_chr, seq_range=seq_range, database=database,
                                       source=source, indent=indent, verbose=verbose)
    elif source_lc in ["2bit", "twobit"]:
        seq, itemnotfound = self.twobit(id_full=id_full, id_item=item_chr, database=database,
                                        indent=indent, verbose=verbose)
    else:
        raise DatabaseNotFoundError('Not a valid database source: {}'.format(source))

    if itemnotfound is not None:
        if verbose > 1:
            print('Some items were not found:', indent=indent)
            print(itemnotfound, indent=indent)
    if old_strand != strand:
        if verbose > 1:
            print('Sequence was inverted! Reverse complementing now...', indent=indent)
        seq.seq = seq.seq.reverse_complement()
        if verbose > 1:
            print('Done!', indent=indent)

    seq.features.append(SeqFeature.SeqFeature(type='duplicate'))
    # FeatureLocation only accepts +1, -1, 0 or None as strand, so an
    # unknown strand (".") is stored as None.
    s = {'+': 1, '-': -1}.get(strand)
    # NOTE(review): the annotations go on features[0], which is the feature
    # appended above only when the record had no features before - confirm
    # that this is intended for records that already carry features.
    seq.features[0].location = SeqFeature.FeatureLocation(int(seq_range[0]), int(seq_range[1]), strand=s)
    seq.features[0].qualifiers['score'] = score
    seq.features[0].qualifiers['query_coverage'] = query_coverage
    seq.features[0].qualifiers['thickStart'] = thickStart
    seq.features[0].qualifiers['thickEnd'] = thickEnd
    seq.features[0].qualifiers['itemRGB'] = rgb
    seq.features[0].qualifiers['blockCount'] = blockcount
    seq.features[0].qualifiers['blockSizes'] = blockspans
    seq.features[0].qualifiers['blockStarts'] = blockstarts
    seq.name = item_chr
    return id_full, seq, itemnotfound
def blat_run(seq_record, port, local="localhost", filetype="fasta", blat_type='blat', perc_ident=None,
             verbose=True, indent=0, blatdb='/usr/db/blastdb/', outtype='pslx'):
    """A wrapper function for BLAT searches.

    Runs ``gfClient`` against a gfServer, feeding the query on stdin, and
    parses its output (minus the final line) with ``SearchIO.read``.

    :param seq_record: The record containing the query sequence for the search. Can be either a
        SeqIO.SeqRecord or a string with the file location.
    :param int port: Port of the gfServer to be queried.
    :param str local: Host address. [Default: 'localhost']
    :param str filetype: Filetype of seq_record (if seq_record is a SeqRecord object, leave as default).
        [Default: 'fasta']
    :param str blat_type: Type of search to conduct; 'tblat' adds ``-t=dnax -q=prot``.
        [Default: 'blat']
    :param int perc_ident: Minimum percent identity required of results; a falsy value is sent
        as 0. [Default: None]
    :param bool verbose: Verbose output? [Default: True]
    :param int indent: Indent level for pretty print. [Default: 0]
    :param str blatdb: Working directory for the gfClient process. [Default: '/usr/db/blastdb/']
    :param str outtype: Output type: 'pslx', 'psl', 'blast8', 'blast9' or 'blast'. [Default: 'pslx']
    :return: ``(blat_record, blat_err)``; ``blat_err`` is always None.
    :raises TypeError: If ``seq_record`` is neither a SeqRecord nor a str.
    :raises SearchError: If gfClient wrote to stderr or ``outtype`` is not supported.
    :raises NoHitsError: If the output could not be read as a query result (ValueError).
    """
    if isinstance(seq_record, SeqIO.SeqRecord):
        pass
    elif isinstance(seq_record, str):
        seq_record = SeqIO.read(seq_record, filetype)
    else:
        raise TypeError('seq_record was of type {}, must be either '
                        'a str with filepath or a SeqRecord object!'.format(type(seq_record)))

    if verbose:
        print("Now starting BLAT...", indent=indent)
    if verbose > 1:
        print('Search Type: ', blat_type, indent=indent)

    args_expanded = ['gfClient', local, str(port), '/', '/dev/stdin', '/dev/stdout']
    args_expanded += ['-t=dnax', '-q=prot'] if blat_type.lower() == 'tblat' else []
    args_expanded += ['minIdentity={}'.format(perc_ident if perc_ident else 0),
                      '-out={}'.format(outtype)]

    if verbose > 1:
        print('BLAT command:', indent=indent)
        print(' '.join(args_expanded), indent=indent + 1)
    blat = subprocess.Popen(args_expanded, stdout=subprocess.PIPE, universal_newlines=True, cwd=blatdb,
                            stdin=subprocess.PIPE, stderr=subprocess.PIPE)
    blat_raw, blat_raw_err = blat.communicate(input=seq_record.format('fasta'))
    if blat_raw_err:
        raise SearchError(blat_raw_err)

    # Drop the last line of the output. This was previously done by piping
    # through `head -n -1`; doing it here avoids a second process and the
    # dependency on that head option.
    body = blat_raw[:-1] if blat_raw.endswith('\n') else blat_raw
    kept, newline, _ = body.rpartition('\n')
    blat_result = kept + newline
    blat_err = None

    if verbose > 2:
        print(blat_result, indent=indent)
    if verbose:
        print('Done!', indent=indent)

    # SearchIO.read keyword arguments for each supported output type.
    read_options = {
        'pslx': dict(format='blat-psl', pslx=True),
        'psl': dict(format='blat-psl'),
        'blast8': dict(format='blast-tab'),
        'blast9': dict(format='blast-tab', comments=True),
        'blast': dict(format='blast-xml'),
    }
    blat_record = None
    with StringIO(blat_result) as fin:
        try:
            if outtype not in read_options:
                raise SearchError('Invalid out type')
            blat_record = SearchIO.read(fin, **read_options[outtype])
        except ValueError:
            if verbose:
                print('No Query Results were found in handle for seq_record {}!'.format(seq_record.id))
            raise NoHitsError('No Query Results were found in handle for seq_record {}!'.format(seq_record.id))
        except Exception as err:
            print('Error reading BLAT results! Aborting!')
            print('Error details:\n', err)
            raise
    return blat_record, blat_err
def biosql_get_record(id_list, sub_db_name, passwd='', id_type='accession', driver="psycopg2", indent=0,
                      user="******", host="localhost", database="bioseqdb", num_proc=2, verbose=True, server=None):
    """Look up sequence records for a list of identifiers using worker processes.

    One ``BioSeqLookupCascade`` job per identifier is queued for ``num_proc``
    ``GetSeqMP`` workers. Each result is an ``(id, record)`` pair; the
    record's ``name`` is set to the id before it is stored.

    :param id_list: A list of identifiers, or a single identifier.
    :param sub_db_name: Name of the sub-database, passed to the workers.
    :param passwd: Database password, passed to the workers.
    :param id_type: Identifier type given to every lookup job. [Default: 'accession']
    :param driver: Database driver, passed to the workers. [Default: 'psycopg2']
    :param indent: Indent level for pretty print. [Default: 0]
    :param user: Database user, passed to the workers.
    :param host: Database host, passed to the workers. [Default: 'localhost']
    :param database: Database name, passed to the workers. [Default: 'bioseqdb']
    :param num_proc: Number of worker processes. [Default: 2]
    :param verbose: Verbosity level. [Default: True]
    :param server: Server handle, passed to the workers.
    :return tuple: ``(seqdict, itemsnotfound)``: the records keyed by id, and the
        identifiers from ``id_list`` that are not keys of ``seqdict``.

    Example::

        biosql_get_record(sub_db_name='MyoLuc2.0', passwd='',
                          id_list=['NW_005871148', 'NW_005871300', 'NW_005871148'],
                          id_type='accession', driver="psycopg2", user="******",
                          host="localhost", database="bioseqdb", verbose=True)
    """
    job_queue = multiprocessing.JoinableQueue()
    result_queue = multiprocessing.Queue()
    if verbose > 2:
        print('\tStarting biosql_get_record_mp', indent=indent)

    if not isinstance(id_list, list):
        id_list = [id_list]
    seqdict = dict()

    workers = []
    for _ in range(num_proc):
        workers.append(GetSeqMP(job_queue, result_queue, database=database, host=host, driver=driver,
                                user=user, passwd=passwd, sub_db_name=sub_db_name, verbose=verbose,
                                server=server))
    for worker in workers:
        worker.start()

    for identifier in id_list:
        job = BioSeqLookupCascade(id_type=id_type, identifier=identifier, verbose=verbose, indent=indent)
        job_queue.put(job)
    # One sentinel per worker.
    for _ in workers:
        job_queue.put(None)

    # Exactly one result is expected for every queued identifier.
    for _ in range(len(id_list)):
        result = result_queue.get()
        print(result, indent=indent)
        result[1].name = result[0]
        seqdict[result[0]] = result[1]

    if verbose:
        print('Done with biosql_get_record_mp!', indent=indent)
        print('Closing processes!', indent=indent)
    for worker in workers:
        if worker.is_alive():
            worker.join()

    itemsnotfound = [identifier for identifier in id_list if identifier not in seqdict]
    return seqdict, itemsnotfound