Example #1
0
def nr_by_longest(handle, filetype='fasta', write=True):
    """Reduce a sequence file to one record per ID, keeping the longest record.

    Records are read with ``SeqIO.parse``. A record whose sequence equals
    'Sequenceunavailable' is reported and skipped. For every other record the
    ID is cut at the first '|' (the text up to the next '|' becomes the
    description); IDs without a '|' are cut at the first space and get an
    empty description.

    :param handle: Path of the input file; also used to derive the output name.
    :param str filetype: Format name handed to ``SeqIO.parse``/``SeqIO.write``.
    :param bool write: If True, also write the result to 'nr_<input file name>'
        in the current working directory.
    :return: A generator over the retained records, in order of first
        appearance of each ID.
    :raises AssertionError: if a shortened ID is only a database tag
        ('gi', 'emb' or 'acc').
    """
    seqdict = {}

    for seq in SeqIO.parse(handle, filetype):
        if seq.seq == 'Sequenceunavailable':
            print('Seq Unavailable:\t', seq.name)
            continue
        full_id = seq.id
        head, sep, tail = full_id.partition('|')
        if sep:
            seq.id, seq.description = head, tail.split('|')[0]
        else:
            seq.id, seq.description = full_id.split(' ')[0], ''
        # The original chained the comparisons with `or`, which is always true.
        # A bare database tag as key would silently merge unrelated records.
        assert seq.id not in ('gi', 'emb', 'acc'), \
            'ID {!r} reduced to a database tag, not an identifier!'.format(full_id)
        kept = seqdict.get(seq.id)
        if kept is None or len(seq) > len(kept):
            seqdict[seq.id] = seq

    # Materialise once: the same records are written AND returned. Previously a
    # single generator was handed to SeqIO.write and then returned exhausted.
    records = list(seqdict.values())
    if write:
        outhandle = 'nr_' + str(Path(handle).name)
        with Path(outhandle).open('w') as outf:
            SeqIO.write(records, outf, filetype)
    return (seq for seq in records)
Example #2
0
 def blast_prep(search_type, database, species, verbose, indent, db_loc):
     """Work out which BLAST database to use for *species*.

     :param search_type: Search type forwarded to ``get_searchdb`` in auto mode.
     :param database: 'auto'/'auto-transcript' (look the database up with
         ``get_searchdb``), a dict mapping species to database, or a str/Path
         that is returned unchanged.
     :param species: Species used for the lookup.
     :param int verbose: Messages are printed when greater than 1.
     :param int indent: Indent level for printed output.
     :param db_loc: Database folder forwarded to ``get_searchdb``.
     :return: The selected database.
     :raises SearchError: if no database can be found for the species or
         *database* has an unsupported type.
     """
     not_found = 'No BLAST database was found for species {}!'

     if database in ('auto', 'auto-transcript'):
         if verbose > 1:
             print('Blast type set to auto!', indent=indent)
         try:
             return get_searchdb(search_type=search_type,
                                 species=species,
                                 db_loc=db_loc,
                                 verbose=verbose,
                                 indent=indent + 1)
         except Exception:
             raise SearchError(not_found.format(species))

     if isinstance(database, dict):
         try:
             chosen = database[species]
             if verbose > 1:
                 print('Using {} as BLAST database!'.format(chosen),
                       indent=indent)
         except KeyError:
             raise SearchError(not_found.format(species))
         return chosen

     if isinstance(database, (str, Path)):
         return database

     raise SearchError('Invalid type given for database!')
Example #3
0
 def blat_prep(database_port, species, verbose, indent):
     """Work out the gfServer port to use for *species*.

     :param database_port: A dict mapping species to port, an int port, or a
         string that can be converted with ``int()``.
     :param species: Species used as the key when *database_port* is a dict.
     :param int verbose: Messages are printed when greater than 1.
     :param int indent: Indent level for printed output.
     :return: The port (the dict value as stored, or an int).
     :raises SearchError: if the species is missing from the dict, the string
         is not an integer, or the argument has another type.
     """
     if isinstance(database_port, dict):
         try:
             port = database_port[species]
             if verbose > 1:
                 print('Using port {0} for gfServer of species {1}.'.format(
                     port, species),
                       indent=indent)
         except KeyError:
             raise SearchError(
                 'No 2bit found for species {}!'.format(species))
         return port

     if isinstance(database_port, int):
         return database_port

     if not isinstance(database_port, str):
         raise SearchError(
             'Invalid option of type "{}" was passed to database_port! database_port must be '
             'either a dictionary of species-port pairs or an '
             'integer!'.format(str(type(database_port))))

     try:
         return int(database_port)
     except ValueError:
         raise SearchError(
             'Invalid option "{}" was passed to database_port! database_port must be '
             'either a dictionary of species-port pairs or an integer!'.
             format(database_port))
Example #4
0
    def twobit(id_full, id_item, database, indent, verbose):
        """Fetch one sequence from a 2bit file by running ``twoBitToFa``.

        The command ``twoBitToFa <database>:<id_full> /dev/stdout`` is run and
        its standard output is parsed as a single FASTA record.

        :param id_full: Sequence specifier appended to the database after ':'.
        :param id_item: Item that would be reported as "not found".
        :param database: 2bit file handed to ``twoBitToFa``.
        :param int indent: Indent level for printed output.
        :param int verbose: Verbosity; more detail at > 0, > 1 and > 3.
        :return: ``(seq, itemsnotfound)``.
        """
        command = [
            "twoBitToFa", '{0}:{1}'.format(database, id_full), '/dev/stdout'
        ]
        if verbose > 1:
            print('Command:', indent=indent)
            print(' '.join(command), indent=indent + 1)

        raw_output = subprocess.check_output(command,
                                             universal_newlines=True,
                                             stdin=subprocess.PIPE,
                                             stderr=subprocess.PIPE)
        if verbose > 1:
            print("TwoBitToFa type: ", type(raw_output), indent=indent)

        # NOTE(review): check_output with universal_newlines=True returns str,
        # so this guard is not expected to fire; a failing command surfaces as
        # CalledProcessError from check_output itself.
        if type(raw_output) is not str:
            _, seq_err = raw_output
            raise FetchSeqError(seq_err)

        # Past the guard the output is a str, never None, so the
        # "item not found" slot of the result is always None here.
        if verbose:
            print('Got sequence for ', id_full, indent=indent)
        if verbose > 3:
            print(str(raw_output).replace('\n', '\n' + '\t' * (indent + 1)),
                  indent=indent + 1)
        with StringIO(raw_output) as fasta_text:
            record = SeqIO.read(fasta_text, 'fasta')
        return record, None
Example #5
0
 def run(self):
     """Process tasks from ``self.task_queue`` until a ``None`` sentinel arrives.

     Each task is called with ``server`` and ``sub_db_name`` taken from this
     object; the task is marked done and its result is put on
     ``self.result_queue``. The sentinel is marked done too before returning.
     """
     tasks = self.task_queue
     pending = tasks.get()
     while pending is not None:
         outcome = pending(server=self.server,
                           sub_db_name=self.sub_db_name)
         tasks.task_done()
         self.result_queue.put(outcome)
         pending = tasks.get()

     if self.verbose:
         print('\tAll GetSeq tasks in this process are complete')
     tasks.task_done()
Example #6
0
def cull_reciprocal_best_hit(recblast_out):
    """Keep only the reciprocal best hits in a recblast_out container.

    A record is kept when its description has the form
    ``<target id>|-|<annotations>`` and the first ``|[...]|`` annotation
    contains the query name. The container is modified in place: each
    ``[species][query]['recblast_results']`` entry is replaced by the list of
    retained records. Queries without a 'recblast_results' entry are reported
    and left untouched.

    :param recblast_out: A mapping of species -> query -> record dict, or a
        list of such mappings (each one is processed recursively).
    :return: The same container (or a new list of the processed containers).
    """
    if isinstance(recblast_out, list):
        return [cull_reciprocal_best_hit(rc) for rc in recblast_out]

    # Raw string: the original literal relied on invalid escape sequences.
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation
    for species, rc_spec_rec in recblast_out.items():
        for query, rc_rec in rc_spec_rec.items():
            try:
                rc_out = rc_rec['recblast_results']
            except KeyError:
                print('No entries in recblast_results for query {0} in species {1}'.format(query, species))
                continue
            kept = []
            for record in rc_out:
                try:
                    # Exactly one '|-|' separator is expected.
                    target_id, annotations = record.description.split('|-|')
                except ValueError:
                    print(record.description, indent=2)
                    print('Could not unpack annotations!', indent=2)
                    continue
                id_lst = pat.findall(annotations)
                if not id_lst:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(record.name,
                                                                                                 species,
                                                                                                 query))
                    continue
                # Only the first (top) annotation decides reciprocity.
                if query in id_lst[0]:
                    kept.append(record)
                else:
                    print("For query {0}, target {1} was not a reciprocal best hit!".format(query,
                                                                                            target_id))
            recblast_out[species][query]['recblast_results'] = kept
    return recblast_out
Example #7
0
 def __init__(self, file):
     """Parse a '#'-annotated, tab-separated file into per-region row lists.

     Line handling, in order of precedence:

     * the first line is stored (without leading '#'/' ') as ``self.header``;
     * ``# region=<name>`` starts a region; the sanitised name is appended to
       ``self.regions`` and becomes an attribute holding that region's rows;
     * a line starting with '#' but not '# ' gives the tab-separated column
       names (stored in ``self.colnames``);
     * ``# No data`` adds one row of ``None`` values to the current region;
     * any other line is a data row, split on tabs.

     Rows are namedtuples built from the column names. Punctuation in region
     and column names is replaced by '_'.

     :param file: Path to the file, as ``str`` or ``Path``.
     :raises TypeError: if *file* is neither ``str`` nor ``Path``, or a data
         row does not fit the declared columns.
     :raises AssertionError: if the path does not exist or is not a file.
     :raises NameError: if rows appear before a region and column names.
     """
     specials = '!@#$%^&*();:.,\'\"/\\?<>|[]{}-=+'
     transtab = str.maketrans(specials, '_' * len(specials))
     if isinstance(file, str):
         self.file = Path(file)
     elif isinstance(file, Path):
         # The original never assigned self.file on this branch.
         self.file = file
     else:
         raise TypeError('File must be either a str or Path object!')
     assert self.file.exists(), str(file) + ' is an invalid file path or does not exist!'
     assert self.file.is_file(), str(file) + ' is not a valid file!'

     region_prefix = '# region='
     region = None
     ColNames = None
     self.regions = []
     with self.file.open() as f:
         for index, line in enumerate(f):
             line = line.strip()
             if index == 0:
                 self.header = line.lstrip('# ')
             elif line.startswith(region_prefix):
                 # Slice the prefix off: str.lstrip() removes a *set* of
                 # characters and would also eat leading letters of the name.
                 region = line[len(region_prefix):].translate(transtab)
                 if getattr(self, region, None) is None:
                     self.regions.append(region)
                     setattr(self, region, [])
             elif line.startswith('#') and not line.startswith('# '):
                 cnames = line.lstrip('#').translate(transtab)
                 ColNames = namedtuple('ColNames', cnames.split('\t'))
                 self.colnames = ColNames._fields
             else:
                 if region is None or ColNames is None:
                     raise NameError('Parser encountered a line of data before either the column names '
                                     'or the genomic region was declared in the file!')
                 if line.startswith('# No data'):
                     row = ColNames(*[None] * len(self.colnames))
                 else:
                     try:
                         row = ColNames(*line.split('\t'))
                     except TypeError:
                         # Show the offending line before propagating.
                         print(line, file=sys.stderr)
                         raise
                 # Append in place instead of rebuilding the list per row.
                 getattr(self, region).append(row)
Example #8
0
 def run(self):
     """Run FetchSeq jobs from ``self.id_queue`` until a ``None`` sentinel arrives.

     Every job is called with this object's connection and search settings
     as keyword arguments and must return ``(id_item, seq, miss_items)``.
     If a job raises, the error is printed and ``('', '', [])`` is used
     instead. ``((id_item, seq), miss_items)`` is put on
     ``self.seq_out_queue`` for every job.
     """
     while True:
         job = self.id_queue.get()
         if job is None:
             break
         try:
             # Built inside the try so a missing attribute is handled like
             # any other job failure.
             settings = dict(passwd=self.passwd,
                             id_type=self.id_type,
                             driver=self.driver,
                             user=self.user,
                             host=self.host,
                             database=self.database,
                             database_path=self.database_path,
                             delim=self.delim,
                             server=self.server,
                             version=self.version,
                             add_length=self.add_length,
                             species=self.species,
                             source=self.source,
                             verbose=self.verbose,
                             n_threads=self.n_subthreads,
                             indent=self.indent)
             id_item, seq, miss_items = job(**settings)
         except Exception as err:
             print('FetchSeq Error!')
             print(type(err), err)
             id_item, seq, miss_items = '', '', []
         self.id_queue.task_done()
         self.seq_out_queue.put(((id_item, seq), miss_items))

     # Acknowledge the sentinel itself.
     self.id_queue.task_done()
     print('All FetchSeqs in Queue completed!', indent=self.indent)
Example #9
0
def format_range(seqrange, strand, addlength, indent, verbose):
    """Normalise a sequence range and extend it on both sides.

    If the start is larger than the end, the two are swapped and the strand
    is flipped ('-' becomes '+', anything else becomes '-'). The first value
    of *addlength* is then subtracted from the start and the second added to
    the end.

    :param seqrange: Two-item sequence ``(start, end)`` of int-convertible
        values. If conversion fails the error is printed and ``(0, -1)`` is
        used for the comparison.
    :param str strand: Strand of the range.
    :param tuple addlength: ``(left, right)`` extension lengths. If conversion
        fails the error is printed and no extension is applied.
    :param int indent: Indent level for printed output.
    :param int verbose: Messages are printed at levels > 1 and > 2.
    :return: ``((new_start, new_end), strand)``.
    :raises AssertionError: if *addlength* is not a tuple of length 2.
    """
    assert isinstance(
        addlength, tuple), "addlength was of type {}, must be a tuple!".format(
            type(addlength))
    assert len(
        addlength
    ) == 2, "addlength must be a tuple of length 2! Received: {}".format(
        addlength)
    try:
        lextend = -int(addlength[0])
        rextend = int(addlength[1])
    except Exception as err:
        print(type(err), err)
        lextend = 0
        rextend = 0
    try:
        lrange = int(seqrange[0])
        rrange = int(seqrange[1])
    except Exception as err:
        print(type(err), err)
        lrange = 0
        rrange = -1
    if verbose > 1:
        print('Original range: {0}-{1}{2}'.format(lrange, rrange, strand),
              indent=indent)
        print(
            'Adding {0} steps to the beginning and {1} steps to the end of the sequence!'
            .format(lextend, rextend),
            indent=indent)
    if lrange > rrange:
        # Reversed coordinates: report them low-to-high on the opposite strand.
        strand = '+' if strand == '-' else '-'
        lrange, rrange = seqrange[1], seqrange[0]
    newrange = (int(lrange) + lextend, int(rrange) + rextend)
    if verbose > 2:
        # Report the extended range; the original printed the old one here.
        print('New range: {0}-{1}{2}'.format(newrange[0], newrange[1], strand),
              indent=indent)
    return newrange, strand
Example #10
0
def drop_overlaps_bed(bedfile):
    """Return only the entries whose interval overlaps no other entry.

    Entries are grouped by their first key element. Within a group, two
    entries overlap when their closed intervals ``[start, end]`` share at
    least one position; start/end are taken from key elements 1 and 2 and
    may be given in either order. Every entry involved in an overlap is
    dropped.

    :param bedfile: A dict keyed by tuples ``(chrom, start, end, ...)``, or
        anything else, which is loaded with
        ``read_bed(bedfile, key_col=slice(0, 3))``.
    :return: A new dict with the surviving keys and their original values,
        grouped by chrom in order of first appearance.
    """
    d = bedfile if isinstance(bedfile, dict) else read_bed(bedfile, key_col=slice(0, 3))

    # Keep the original key next to its normalised interval, so survivors can
    # be copied without rebuilding the key. The rebuilt key used before never
    # matched entries stored with start > end.
    by_chrom = {}
    for loc in d:
        low, high = sorted((int(loc[1]), int(loc[2])))
        by_chrom.setdefault(loc[0], []).append((low, high, loc))

    filtered_d = {}
    for entries in by_chrom.values():
        for i, (low, high, key) in enumerate(entries):
            clashes = any(
                i != j and max(low, other_low) <= min(high, other_high)
                for j, (other_low, other_high, _) in enumerate(entries))
            if not clashes:
                filtered_d[key] = d[key]
    return filtered_d
Example #11
0
    def fun(self, hit, stat, verbose=False):
        """Tell whether the top annotation of *hit* resolves to symbol *stat*.

        The hit description is expected to look like
        ``<target>|-|<annotations>``. The text before the first ':' of the
        first ``|[...]|`` annotation is passed to ``id_search`` as a symbol
        and the resulting identifier is compared with *stat*.

        :param hit: Record with a ``description`` attribute.
        :param stat: Symbol the top annotation has to match.
        :param verbose: Verbosity forwarded to ``id_search``.
        :return bool: True on a match; False when the description cannot be
            split, has no usable annotation, or names another symbol.
        """
        pat = re.compile(r'\|\[(.*?):.*\]\|')  # regex for items in annotation
        try:
            top_anno = hit.description.split('|-|')[1]
        except (ValueError, IndexError):
            print(hit.description, indent=2)
            print('Could not unpack annotations!', indent=2)
            return False
        found = pat.findall(top_anno)
        if not found:
            # Previously an uncaught IndexError.
            return False
        id_lst = found[0].strip()
        if not id_lst:
            return False
        _, hit_symbol, _, _ = id_search(id_lst, id_type='symbol', verbose=verbose)
        # Always a bool; a mismatch used to fall through and return None.
        return bool(stat == hit_symbol)
Example #12
0
def count_reciprocal_best_hits_from_pandas(pandas_df):
    """Count reciprocal best hits per species and query from a DataFrame.

    For every (target_species, query_name) pair, the cells in columns 5 up to
    but excluding the last are treated as FASTA headers. A hit counts when the
    ``|[...]|`` annotations after the ``|-|`` separator resolve, via
    ``id_search`` as a symbol, to the query name.

    :param pandas_df: DataFrame with 'target_species' and 'query_name'
        columns; hit descriptions are read positionally from columns 5:-1.
    :return dict: species -> ``Counter`` mapping query name to hit count.
    """
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation
    species_counters = {}
    for species in list(pandas_df.target_species.unique()):
        counter = species_counters[species] = Counter()
        species_results = pandas_df.loc[pandas_df['target_species'] == species]
        for query in list(species_results.query_name.unique()):
            print(query)
            # Positional slice: `.ix` was removed from pandas in 1.0.
            query_results = species_results.loc[species_results['query_name'] == query].iloc[:, 5:-1]
            headers = [cell
                       for _, row in query_results.iterrows()
                       for cell in row.tolist()
                       if cell is not None]
            # Wrap each description as a FASTA header so SeqIO can parse it.
            fasta_text = '\n'.join('>' + header for header in headers)
            for hit in SeqIO.parse(StringIO(fasta_text), 'fasta'):
                try:
                    hit_split = hit.description.split('|-|')
                    id_lst = ''.join(pat.findall(hit_split[1]))
                except (ValueError, IndexError):
                    # A missing '|-|' raises IndexError, not ValueError.
                    print(hit.description, indent=2)
                    print('Could not unpack annotations!', indent=2)
                    continue
                if not id_lst:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(hit.name,
                                                                                                 species,
                                                                                                 query))
                    continue
                _, hit_symbol, _, _ = id_search(id_lst, id_type='symbol', verbose=0)
                if query == hit_symbol:
                    counter[query] += 1
    return species_counters
Example #13
0
def bed_extract_duplicates(bedfile, outfile="", verbose=False):
    """Write the BED entries whose base name occurs more than once.

    The base name of an entry is its key without the final ``_<suffix>``
    part. All entries sharing a base name with another entry are written,
    tab-separated, to the output file in sorted key order. No file is
    written when there are no duplicates.

    :param bedfile: Path of the BED file, loaded with ``read_bed``.
    :param outfile: Output path; defaults to *bedfile* with its suffix
        replaced by '.bed.dups'.
    :param bool verbose: Print each duplicate with its count, or a notice
        when none are found.
    :raises AssertionError: if *bedfile* does not exist or is not a file.
    """
    bedfile = Path(bedfile)
    assert bedfile.exists(), "Given bedfile path does not exist!"
    assert bedfile.is_file(), "Given bedfile path was not a file! Did you provide a directory?"
    bed_dict = read_bed(bedfile)

    def base_name(hit):
        # One key for counting AND lookup. The original counted by the joined
        # leading parts but looked up by the first part only, so names that
        # contain '_' were never reported.
        return hit.rsplit("_", 1)[0]

    hits = sorted(bed_dict.keys())
    counts = Counter(base_name(hit) for hit in hits)
    duphits = [hit for hit in hits if counts[base_name(hit)] > 1]
    outfile = Path(outfile) if outfile else bedfile.with_suffix(".bed.dups")

    if not duphits:
        if verbose:
            print("No duplicates found in file!")
        return

    with outfile.open("w") as of:
        for hit in duphits:
            if verbose:
                print(hit, "\t", counts[base_name(hit)])
            of.write("\t".join(str(i) for i in bed_dict[hit]) + "\n")
Example #14
0
    def __call__(self, sub_db_name, server):
        """Look up this task's identifier in one sub-database of *server*.

        The sub-database is fetched as ``server[sub_db_name]``. A ``KeyError``
        is reported and retried once after 0.1 seconds; a second ``KeyError``
        propagates.

        :param sub_db_name: Key of the sub-database within *server*.
        :param server: Mapping-like object holding the sub-databases.
        :return: ``(identifier, seqrec)`` where *seqrec* is the result of
            ``biosql_seq_lookup_cascade``.
        """
        if self.verbose:
            print('\tFetching sequence: ', self.identifier, indent=self.indent)
        try:
            dtbase = server[sub_db_name]
        except KeyError as err:
            print('Woah! KeyError!', err, indent=self.indent)
            print(
                'Waiting for 0.1 second and rerunning in case it was a collision!',
                indent=self.indent)
            sleep(0.1)
            # Single retry; a repeated KeyError is left to the caller.
            dtbase = server[sub_db_name]

        lookup_args = dict(dtbase=dtbase,
                           sub_db_name=sub_db_name,
                           id_type=self.id_type,
                           indent=self.indent,
                           identifier=self.identifier,
                           verbose=self.verbose)
        return self.identifier, biosql_seq_lookup_cascade(**lookup_args)
Example #15
0
 def __call__(self,
              seq_record,
              species,
              database,
              database_path,
              local,
              indent,
              perc_ident,
              verbose,
              database_port=None,
              expect=None,
              megablast=True,
              n_threads=1,
              write=False,
              filetype=None,
              **kwargs):
     """Run the configured search, or load stored results instead.

     Dispatch order:

     1. *database* is a ``Path``, or a string other than 'stop': the results
        are loaded with ``self.load``.
     2. *database* equals 'stop': ``StopRecBlast`` is raised.
     3. ``self.search_type`` is a BLAST type: the database is resolved with
        ``self.blast_prep`` and ``self.blast_run`` is called.
     4. ``self.search_type`` is a BLAT type: the port is resolved with
        ``self.blat_prep`` and ``self.blat_run`` is called.

     Extra keyword arguments are forwarded to ``blast_run`` only.

     :return: Whatever ``load``, ``blast_run`` or ``blat_run`` returns.
     :raises StopRecBlast: if *database* is 'stop'.
     :raises SearchEngineNotImplementedError: for any other search type.
     """
     blast_types = ("blastn", "blastp", "blastx", "tblastx", "tblastn")
     blat_types = ('blat', 'tblat', 'blat-transcript', 'tblat-transcript')

     if isinstance(database, Path):
         return self.load(database)
     if isinstance(database, str) and database != 'stop':
         return self.load(Path(database))
     if database == 'stop':
         raise StopRecBlast()

     if self.search_type in blast_types:
         if verbose > 1:
             print(self.search_type, 'was selected.', indent=indent)
         dt = self.blast_prep(search_type=self.search_type,
                              db_loc=database_path,
                              database=database,
                              species=species,
                              verbose=verbose,
                              indent=indent)
         return self.blast_run(seq_record=seq_record,
                               species=species,
                               database=dt.name,
                               filetype=filetype,
                               blast_type=self.search_type,
                               local_blast=local,
                               expect=expect,
                               megablast=megablast,
                               use_index=False,
                               perc_ident=perc_ident,
                               verbose=verbose,
                               indent=indent,
                               n_threads=n_threads,
                               blastdb=database_path,
                               outtype=5,
                               return_raw=False,
                               **kwargs)

     if self.search_type in blat_types:
         if verbose > 1:
             print(self.search_type, 'was selected.', indent=indent)
         port = self.blat_prep(database_port=database_port,
                               species=species,
                               verbose=verbose,
                               indent=indent)
         return self.blat_run(seq_record=seq_record,
                              local=local,
                              port=port,
                              filetype=filetype,
                              blat_type=self.search_type,
                              perc_ident=perc_ident,
                              verbose=verbose,
                              indent=indent,
                              blatdb=database_path,
                              outtype='pslx')

     raise SearchEngineNotImplementedError(
         'Invalid selection for search type!')
Example #16
0
def id_search(id_rec,
              id_type='brute',
              verbose=2,
              indent=0,
              custom_regex=None,
              regex_only=False):
    r"""Split an identifier string into ID, sequence range, strand and score.

    Examples of strings the built-in patterns are written for::

        accession  'XP_010883249.1'
        scaffold   'scaffold_145\t[:1033526-1034566](-)\t190'
        chr        'chrX[:3047971-3259961](-)119'
        assembly   'KN678312.1\t[:9787-29116](+)\t478'
        symbol     'TP53'  or  'INS [:259-568](+) (161)'

    :param str id_rec: The string to parse.
    :param str id_type: Name of the pattern to apply, or 'brute' to try
        accession, gi, scaffold, id, chr, assembly, assembly_broad and symbol
        in that order and use the first one that matches.
    :param int verbose: Progress messages are printed when greater than 1.
    :param int indent: Indent level for printed output.
    :param custom_regex: Compiled pattern used instead of the built-in ones;
        the id type is then reported as 'custom'.
    :param bool regex_only: In brute mode, return the first matching compiled
        pattern instead of parsing.
    :return: ``(patterns, identifier, (start, end, strand, score), id_type)``.
        Without a range, start/end are ``0``/``-1``; strand defaults to
        '(N)' and score to ``0``.
    :raises IDError: if no pattern matches or the match cannot be split.
    """
    # Shared tail: optional separator, optional [:start-end], optional
    # separator, then everything else (strand/score information).
    tail = r'([| \t:_])?\[?(:?\d+-?\d+)?\]?([| \t:_])?(.*)'
    seq_range_pat = re.compile(r':?(\d+)-(\d+)')
    strand_pat = re.compile(r'(\([-+0N]\))')
    score_pat = re.compile(r'\d\d*')
    p = dict(
        gi=re.compile(r'(\Agi[| _:]+[0-9.]+)' + tail),
        accession=re.compile(
            r'(\A[AXNYZ][MWRCPGTZ][| _:]+[0-9.]+|\Aref[| _:]+[0-9.]+)' + tail),
        scaffold=re.compile(r'(\Ascaffold[| _:]+[0-9.]+)' + tail),
        id=re.compile(r'(\Aid[| _:]*[0-9.]+)' + tail),
        chr=re.compile(r'(\Achr[| _:]*[A-Za-z0-9.]+)'
                       r'([| \t:_])??\[?(:?\d+-?\d+)?\]?([| \t:_])?(.*)'),
        assembly=re.compile(r'(\A[A-Za-z]+[0-9.]+)' + tail),
        # Raw string matters here: '\b' in a normal literal is a backspace
        # character, so this pattern could never match a word boundary.
        assembly_broad=re.compile(r'(\b[ALYB]+[0-9.]+)' + tail),
        symbol=re.compile(r'(\A\S+)' + tail),
        seq_range=seq_range_pat,
        strand=strand_pat,
        score=score_pat)
    if custom_regex is not None:
        # The helper patterns above stay available as locals; previously a
        # custom regex with a range failed with KeyError on p['seq_range'].
        p = {'custom': custom_regex}
        id_type = 'custom'

    # Begin search:
    if verbose > 1:
        print('ID Loaded, performing regex search for identifiers...',
              indent=indent)
        print('ID type: ', id_type, indent=indent)
    if id_type == 'brute':
        for tmp_type in [
                'accession', 'gi', 'scaffold', 'id', 'chr', 'assembly',
                'assembly_broad', 'symbol'
        ]:
            if p[tmp_type].findall(id_rec):
                if verbose > 1:
                    print(
                        'Brute Force was set, tested strings for all pre-registered IDs.',
                        indent=indent)
                    print('ID was selected as type {0}!'.format(tmp_type),
                          indent=indent + 1)
                if regex_only:
                    return p[tmp_type]
                return id_search(id_rec=id_rec,
                                 id_type=tmp_type,
                                 verbose=verbose,
                                 indent=indent)
        raise IDError(
            'Couldn\'t identify the id type of line: {}!'.format(id_rec))

    found = p[id_type].findall(id_rec)
    if not found:
        raise IDError(
            'Could not identify patterns in {0} with id_type={1}, '
            'is the id_search sequence correct?'.format(id_rec, id_type))
    item_parts = found[0]
    if verbose > 1:
        print('Successfully found {0}, compiling list!'.format(id_type),
              indent=indent)
        print('Item:\t', '\t'.join(item_parts), indent=indent + 1)

    item_parts = list(item_parts)
    try:
        # Group 3 is the range text, group 5 the trailing strand/score text.
        range_text = item_parts[2]
        tail_text = item_parts[4]
    except IndexError:
        raise IDError(
            'Could not identify patterns in {0} with id_type={1}, '
            'is the id_search sequence correct?'.format(id_rec, id_type))

    if range_text:
        ranges = seq_range_pat.findall(range_text)
        if not ranges:
            raise IDError(
                'A positive match for a sequence range was found '
                '({0}), yet no hits were identified! Confirm that '
                'the regex is correct and try again!'.format(range_text))
        sr_tuple = ranges[0]
        if verbose > 1:
            print('Found sequence delimiters in IDs!', indent=indent)
            print(sr_tuple, indent=indent + 1)
    else:
        sr_tuple = (0, -1)

    # Defaults are strings on every path, so the checks below are consistent.
    strand = '(N)'
    score = '0'
    if tail_text:
        strands = strand_pat.findall(tail_text)
        if strands:
            strand = strands[0]
        scores = score_pat.findall(tail_text)
        if scores:
            score = scores[0]
    if verbose > 1:
        if strand != '(N)':
            print('Strand info found: {0}'.format(strand), indent=indent)
        if score != '0':
            print('Score info found: {0}'.format(score), indent=indent)

    seq_range = (int(sr_tuple[0]), int(sr_tuple[1]), strand, int(score))
    return p, item_parts[0], seq_range, id_type
Example #17
0
def get_searchdb(search_type, species, db_loc, verbose=1, indent=0):
    """Find the search database file for a species and search type.

    Spaces in *species* are replaced by underscores and *db_loc* is searched
    with a glob pattern that depends on the search type:

    * blastp / blastx: ``<species>_protein*``
    * blastn / tblastn / tblastx: ``<species>_genome*``
    * the same with '-transcript' appended: ``<species>_transcript*``
    * blat / tblat: ``<species>*.2bit``
    * blat-transcript / tblat-transcript: ``<species>*transcript.2bit``

    If nothing matches, the search is repeated with a case-insensitive
    abbreviation built from the first three letters of each part of the
    species name (e.g. 'HomSap'). When several files match, the last one in
    sorted order is returned.

    Usage::

        get_searchdb('blastp', 'Homo sapiens', '/path/to/search/files')
        get_searchdb('tblastn', 'Homo sapiens', '/path/to/search/files')
        get_searchdb('blastn-transcript', 'Homo sapiens', '/path/to/search/files')
        get_searchdb('blat', 'Homo sapiens', '/path/to/search/files')
        get_searchdb('blat-transcript', 'Homo sapiens', '/path/to/search/files')

    :param str search_type: Name of the search method (see list above).
    :param str species: Species name associated with the database.
    :param str db_loc: Folder containing the search databases.
    :param int verbose: Zero suppresses progress output; > 1 prints more.
    :param int indent: Indent level for printed output.
    :return: ``Path`` of the selected database file.
    :raises SearchError: if *search_type* is not recognised.
    :raises DatabaseNotFoundError: if *db_loc* is not an existing folder or
        no matching file is found.
    """
    if verbose:
        print('Search DB set to auto, choosing search_db...', indent=indent)
    species = species.replace(' ', '_')
    if verbose > 1:
        print('Search DB location set to: ', db_loc, indent=indent)
    db_type_dict = {
        'blastx': "protein",
        'blastp': "protein",
        'blastn': "genome",
        'tblastn': "genome",
        'tblastx': "genome",
        'blastn-transcript': "transcript",
        'tblastn-transcript': "transcript",
        'tblastx-transcript': "transcript",
        'blat': "blat",
        'tblat': "blat",
        'blat-transcript': 'blat-transcript',
        'tblat-transcript': 'tblat-transcript'
    }
    if search_type not in db_type_dict:
        print('Unable to determine search db type!', indent=indent)
        raise SearchError(
            'Improper search type given ({})!'.format(search_type))
    db_type = db_type_dict[search_type]
    if verbose > 1:
        print('DB type: ', db_type, indent=indent)

    db_path = Path(db_loc).absolute()
    if not db_path.exists():
        db_path = Path(db_loc)
    if not (db_path.exists() and db_path.is_dir()):
        raise DatabaseNotFoundError('DB_Path {} does not exist!'.format(
            str(db_path)))

    def find_candidates(prefix):
        # One place for the three file-name layouts, used for both the full
        # species name and the abbreviated fallback.
        if db_type == 'blat':
            pattern = '{0}*.2bit'.format(prefix)  # Todo: generalize extension
        elif db_type in ['blat-transcript', 'tblat-transcript']:
            pattern = '{0}*transcript.2bit'.format(prefix)
        else:
            pattern = '{0}_{1}*'.format(prefix, db_type)
        return list(db_path.glob(pattern))

    glob_path = find_candidates(species)
    if not glob_path:
        if verbose:
            print('No DB found! Trying again with abbreviated species name',
                  indent=indent)
        species_abbv = ''.join(
            part[0:3] for part in species.title().split('_'))
        # glob has no ignore-case switch, so spell out both cases per letter
        species_abbv_insensitive = ''.join(
            '[{0}{1}]'.format(c.lower(), c.upper())
            for c in species_abbv if c.isalpha())
        if verbose:
            print('Abbreviated species name: ',
                  species_abbv,
                  indent=indent)
            print('RegEx species abbreviation: ',
                  species_abbv_insensitive,
                  indent=indent)
        glob_path = find_candidates(species_abbv_insensitive)

    if verbose:
        print(glob_path, indent=indent)
    if not glob_path:
        print('WARNING: COULD NOT FIND DATABASE! ABORTING!', indent=indent)
        raise DatabaseNotFoundError('', 'No databases were found!')
    search_db = max(glob_path)

    if verbose:
        print('{0} DB chosen: {1}'.format(search_type, str(search_db)),
              indent=indent)
    return search_db
Example #18
0
def blat_server(twobit,
                order='start',
                host='localhost',
                port=20000,
                type='blat',
                log='/dev/null',
                species=None,
                search_db_loc='/usr/db/blat',
                verbose=1,
                indent=0,
                try_limit=10,
                **kwargs):
    """Convenience function that controls a gfServer. Still in alpha.

    This function is a Python wrapper around the command-line 'gfServer' tool. The user can either provide a .2bit
    file, or provide a species and set 'twobit="auto"' to have the function call 'get_searchdb()' to find a .2bit file
    (only the file name of the result is passed on to gfServer).

    Behaviour by 'order':

    * 'status': runs 'gfServer status host port' once. Returns 0 if the output says the connection could not be made,
      raises BLATServerError if the output mentions an error, and returns 1 otherwise.
    * 'stop': runs 'gfServer stop host port' and returns None.
    * anything else (default 'start'): asks an external helper script for the port to use, launches
      'gfServer <order> host port -canStop ...' in the background, then polls 'gfServer status' every 30 seconds
      until the server answers or the number of failed probes exceeds 'try_limit'.

    Usage::

        blat_server(twobit='hg38.2bit', port=20000, verbose=3)
        # runs e.g. 'gfServer start localhost 20001 -canStop -stepSize=5 hg38.2bit' and returns the port used
        blat_server(twobit='auto', port=20000, species='Homo sapiens', verbose=3)
        # calls get_searchdb(search_type='blat', species='Homo sapiens', db_loc=search_db_loc, ...) first
        blat_server(twobit='hg38.2bit', port=20000, order='status', verbose=3)
        # 1 if the server answers, 0 if no connection could be made, BLATServerError on an error message

    :param str twobit: A path to the .2bit file to be used for the server. Can also be set to 'auto'.
    :param str order: A command for gfServer; 'status' and 'stop' are handled specially, every other value is
    placed in the command line where 'start' would go and follows the start-up path.
    :param str host: Address at which to host the server.
    :param int port: Requested port number. On start-up the port actually used is whatever the helper script
    reports for this number.
    :param str type: Type of server to be hosted. 'blat' adds '-stepSize=5', 'tblat' adds '-trans -mask'.
    :param str log: Path of the file that receives the stdout/stderr of the launched gfServer process.
    :param str species: Species name that get_searchdb() will use to find .2bit file when twobit='auto'.
    :param str search_db_loc: Path to the folder containing .2bit file.
    :param int verbose: Level of verbosity of function output. 0 suppresses all output, 3 is max verbosity.
    :param int indent: Indentation level of print output.
    :param int try_limit: Number of failed 30-second-interval probes tolerated before timing out.
    :param kwargs: Extra gfServer options; each 'key=value' pair is passed as '-key=value'.
    :return: On start-up, the port as the string printed by the helper script; for order='status', 0 or 1;
    for order='stop', None.
    :raises BLATServerError: If the auto-selected 2bit file is not an existing file, or gfServer reports an error.
    :raises TimeoutError: If the server never answers within the allowed number of probes.
    """
    # Regular: gfServer start localhost portX -stepSize=5 -log=untrans.log database.2bit
    # Prot>DNAX:  gfServer start localhost portY -trans -mask -log=trans.log database.2bit

    def _gfserver_status(status_host, status_port):
        """Run 'gfServer status' once and return its combined stdout/stderr text."""
        gfcheck = subprocess.Popen('gfServer status {0} {1}'.format(
            str(status_host), str(status_port)),
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   universal_newlines=True,
                                   shell=True,
                                   executable='/bin/bash')
        status_out, _ = gfcheck.communicate()
        return status_out

    def _not_connected(status_out):
        """True when the status output reports a failed connection, whatever the host name is."""
        return "couldn't connect to" in status_out.lower()

    if twobit == 'auto' and order != 'stop':
        if verbose:
            print('2bit set to auto: searching for 2bit file for species ',
                  species,
                  indent=indent)
        twobit = get_searchdb(search_type='blat',
                              species=species,
                              db_loc=search_db_loc,
                              verbose=verbose,
                              indent=indent + 1)
        if twobit.exists() and twobit.is_file():
            twobit = twobit.name
        else:
            raise BLATServerError('Invalid 2bit file!')

    # 'order', 'host' and 'port' are named parameters and therefore can never show up in kwargs,
    # so every extra keyword is simply forwarded to gfServer as a '-key=value' option.
    gfserver_suppl_args = [
        '-{0}={1}'.format(key, item) for key, item in kwargs.items()
    ]

    if order == 'status':
        out = _gfserver_status(host, port)
        if _not_connected(out):
            return 0
        if "error" in out.lower():
            raise BLATServerError(out)
        return 1

    if order == 'stop':
        subprocess.check_call('gfServer stop {0} {1}'.format(
            str(host), str(port)),
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT,
                              universal_newlines=True,
                              shell=True,
                              executable='/bin/bash')
        return

    # Todo: make the portsniffer its own function and make sure it works properly.
    # NOTE(review): the helper script path is machine-specific; presumably it prints a usable port number.
    portfinder = subprocess.check_output(
        '/home/manny/Scripts/oneshot/checkifportisopen.sh {}'.format(
            str(port)),
        universal_newlines=True,
        shell=True,
        executable='/bin/bash')
    port = portfinder.rstrip()

    gfserver_cmd = ['gfServer', str(order), str(host), str(port), '-canStop']
    if type == 'blat':
        gfserver_cmd.append('-stepSize=5')
    elif type == 'tblat':
        gfserver_cmd += ['-trans', '-mask']
    gfserver_cmd += gfserver_suppl_args
    gfserver_cmd_str = ' '.join(gfserver_cmd + [twobit])
    if verbose > 2:
        print(gfserver_cmd_str, indent=indent)
    # The server keeps running in the background and nobody reads its output, so it must not be attached
    # to pipes (a full pipe buffer would block it). Send its output to the requested log file instead.
    with open(log, 'a') as log_handle:
        subprocess.Popen(gfserver_cmd_str,
                         stdout=log_handle,
                         stderr=log_handle,
                         universal_newlines=True,
                         shell=True,
                         executable='/bin/bash')

    tries = 0
    while tries <= try_limit:
        sleep(30)
        out = _gfserver_status(host, port)
        if verbose > 2:
            print(out)
        if _not_connected(out):
            tries += 1
        elif "error" in out.lower():
            raise BLATServerError(out)
        else:
            if verbose:
                print(out)
            return port
    # The loop only falls through once the number of failed probes exceeds try_limit.
    raise TimeoutError('Timed out!')
Example #19
0
def id_ranker(record,
              perc_score,
              perc_query_span,
              perc_ident,
              expect=None,
              indent=0,
              verbose=1,
              same_strand=True,
              return_only=None):
    """Filters the HSPs of a search result and returns them, best first, as BED12-like rows.

    The HSPs are filtered in this order: query span, score, percent identity, strand consistency. A filter whose
    threshold argument is falsy is skipped. Afterwards hits are sorted by the summed score of their HSPs and the
    HSPs inside each hit by their own score, both descending.

    :param query_record record: Search result whose 'program' attribute is 'blat' or contains 'blast'. It must
    provide 'hsp_filter()' and 'sort()' and iterate over hits, which in turn iterate over HSPs.
    :param float perc_score: Minimum fraction of the top HSP score for an HSP to be kept.
    :param float expect: Maximum e-value for a hit (BLAST-only). Accepted but not used by this function.
    :param float perc_query_span: Minimum fraction of the longest HSP query span for an HSP to be kept.
    :param int perc_ident: Minimum value of 'hsp.ident_pct' for an HSP to be kept.
    :param int indent: Indent level for pretty print. [Default: 0]
    :param int verbose: Level of verbose output? [Default: 1]
    :param bool same_strand: Should HSPs with fragments on different strands be filtered out? [Default:True]
    :param return_only: If set, stop after this many HSPs have been added to the result.
    :return list: One list per HSP with the fields
    [hit_id, hit_range, query_id, score, strand, thickStart, thickEnd, rgb, block count, block spans,
    block starts (relative to the hit start), query_range].
    :raises NotImplementedError: If 'record.program' is neither BLAT nor a BLAST flavour.
    :raises NoHitsError: If one of the filters leaves no hits.
    """
    id_list = []
    if verbose:
        print('Beginning ID_Ranker...', indent=indent)
    if record.program == 'blat':
        if verbose > 2:
            print('Results obtained from BLAT run.', indent=indent + 1)
    elif 'blast' in record.program:
        if verbose > 2:
            print('Results obtained from BLAST run.', indent=indent + 1)
    else:
        raise NotImplementedError('Sorry, your program {} is not yet '
                                  'implemented for RecBlast!'.format(
                                      record.program))

    # Reference values taken over every HSP of the unfiltered record:
    top_score = max(hsp.score for hit in record for hsp in hit.hsps)
    if verbose > 1:
        print('Top score for {}:\t'.format(record.id),
              top_score,
              indent=indent)
    top_length = max(hsp.query_span for hit in record for hsp in hit)
    if verbose > 1:
        print('Longest hit for {}:\t'.format(record.id),
              top_length,
              indent=indent)

    # Filter predicates (one per filtering stage):
    def hsp_minscores(hsp):
        return hsp.score >= int(perc_score * top_score)

    def hsp_min_query_span(hsp):
        return hsp.query_span >= perc_query_span * top_length

    def hsp_perc_ident(hsp):
        return hsp.ident_pct >= perc_ident

    def hsp_same_strand(hsp):
        # All fragments agree with the first one exactly when at most one distinct strand value occurs.
        return len(set(hsp.hit_strand_all)) <= 1

    # Sort keys:
    def hit_sort_scores(hit):
        return sum(hsp.score for hsp in hit.hsps)

    def hsp_sort_scores(hsp):
        return hsp.score

    def report_hsp_count(current):
        print('Number of HSPs for {}:\t'.format(current.id),
              sum(len(i.hsps) for i in current),
              indent=indent)

    if verbose > 2:
        print("ALL HITS STATS:")
        print('|\tHit Name:\t|\t# HSPs\t|\tScore:\t|\tLength:\t|\tP.Ident\t|')
        print("==========================================================")
        for hit in record:
            print('|\t{HitName}\t|\t{HSP}\t|'.format(HitName=hit.id,
                                                     HSP=len(hit.hsps)))
            print("------------------------------------------------------")
            for hsp in hit:
                print(
                    '|\t{id}\t|\t{hf}\t|\t{score}\t|\t{length}\t|\t{ident}\t|'.
                    format(id=hsp.hit_id,
                           hf=len(hsp),
                           score=hsp.score,
                           length=hsp.hit_span,
                           ident=hsp.ident_pct))

    # Execute filters:
    # query_span
    if verbose > 1:
        report_hsp_count(record)
        print('Filtering out all HSPs shorter than {}...'.format(
            perc_query_span * top_length),
              indent=indent)
    if perc_query_span:
        record = record.hsp_filter(hsp_min_query_span)
    if not record:
        text = (
            'No hits in Query Results match a stretch of the query sequence longer than '
            '{0}!').format((top_length * perc_query_span))
        raise NoHitsError(text)
    # Score
    if verbose > 1:
        report_hsp_count(record)
        print('Filtering out all HSPs with scores less than {}...'.format(
            top_score * perc_score),
              indent=indent)
    if perc_score:
        record = record.hsp_filter(hsp_minscores)
    if not record:
        text = 'No hits in Query Results have a score above the minimum of {0}!'.format(
            (top_score * perc_score))
        raise NoHitsError(text)
    # Percent identity
    if verbose > 1:
        report_hsp_count(record)
        print(
            'Filtering out all HSPs with percent identity below {}...'.format(
                perc_ident),
            indent=indent)
    if perc_ident:
        record = record.hsp_filter(hsp_perc_ident)
    if not record:
        text = 'No hits in Query Results have a percent identity above {}%!'.format(
            round(perc_ident * 100, 2))
        raise NoHitsError(text)
    # Strand consistency
    if verbose > 1:
        report_hsp_count(record)
        if same_strand:
            print(
                'Filtering out all HSPs that have fragments on opposite strands...',
                indent=indent)
    if same_strand:
        record = record.hsp_filter(hsp_same_strand)
    if not record:
        text = 'No hits in Query Results are on the same strand!'
        raise NoHitsError(text)
    # Sorting them for good measure
    if verbose > 1:
        print('Sorting all hits by descending scores!', indent=indent)
    record.sort(key=hit_sort_scores, reverse=True, in_place=True)
    for hit in record:
        hit.sort(key=hsp_sort_scores, reverse=True, in_place=True)
    if verbose > 1:
        print('Done!', indent=indent)
    # Add items to id_list
    # Big note: think in HSPs, not Hits
    n = 1
    for hit in record:
        for hsp in hit:
            # some quick strand math: one distinct strand value gives +/-, anything else gives "."
            strand = "."
            if hsp._has_hit_strand:
                strands = set(hsp.hit_strand_all)
                if len(strands) == 1:
                    strand = "+" if strands == {1} else "-"
            if verbose > 2:
                print("Adding hit {chr}:{s}-{e}({st}) to id list".format(
                    chr=hsp.hit_id,
                    s=str(hsp.hit_range[0]),
                    e=str(hsp.hit_range[1]),
                    st=strand),
                      indent=indent)
            # hsp.hit_start_all won't necessarily start with the starting point of the hit, so the fragment
            # starts and spans are paired up, sorted by start, and split apart again.
            block_starts, block_spans = zip(*sorted(
                zip(hsp.hit_start_all, hsp.hit_span_all), key=itemgetter(0)))
            hit_start, hit_end = hsp.hit_range[0], hsp.hit_range[1]

            # chr (start,end) id score strand thickStart thickEnd rgb blockcount blockspans blockstarts query_span
            id_list.append([
                hsp.hit_id, hsp.hit_range, hsp.query_id, hsp.score, strand,
                hit_start, hit_end, "255,0,0",
                len(hsp.hit_start_all),
                ",".join(str(i) for i in block_spans),
                ",".join(str(i - hit_start) for i in block_starts),
                hsp.query_range
            ])

            if return_only and n == return_only:
                # Like every other progress message, this one is silenced by verbose=0.
                if verbose:
                    print(
                        'Returning only the top {} hits, ending here!'.format(
                            return_only),
                        indent=indent)
                return id_list
            n += 1
    return id_list
Example #20
0
def count_reciprocal_best_hits(recblast_out):
    """Counts, per species, the hits whose annotated symbol equals the query they were found with.

    'recblast_out' is walked as species -> query -> record dict. For every hit under the record's
    'recblast_results' key, the description is split on '|-|'; all '|[...]|' items found in the second part
    are joined and passed to id_search(..., id_type='symbol'). When the symbol returned by id_search equals
    the query key, that query's count is increased by one.

    :param recblast_out: Mapping of species to a mapping of query to a dict holding 'recblast_results'.
    :return dict: Maps each species to a Counter of query -> number of matching hits.
    """
    # Raw string: '\|' and '\[' are regex escapes, not valid Python string escapes.
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation
    species_counters = {}
    for species, species_dict in recblast_out.items():
        counter = species_counters[species] = Counter()
        for query, query_dict in species_dict.items():
            try:
                rc_out = query_dict['recblast_results']
            except KeyError:
                print('No entries in recblast_results for query {0} in species {1}'.format(query, species))
                continue
            for hit in rc_out:
                hit_split = hit.description.split('|-|')
                if len(hit_split) < 2:
                    # No '|-|' separator, so there is no annotation part to inspect.
                    print(hit.description, indent=2)
                    print('Could not unpack annotations!', indent=2)
                    continue
                annotations = hit_split[1]
                id_lst = ''.join(pat.findall(annotations))
                if not id_lst:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(hit.name,
                                                                                                 species,
                                                                                                 query))
                    continue
                _, hit_symbol, _, _ = id_search(id_lst, id_type='symbol', verbose=0)
                if query == hit_symbol:
                    counter[query] += 1
    return species_counters
Example #21
0
    def blast_run(seq_record,
                  species,
                  database,
                  blast_type,
                  filetype="fasta",
                  local_blast=False,
                  expect=0.005,
                  megablast=True,
                  use_index=False,
                  perc_ident=75,
                  verbose=True,
                  indent=0,
                  n_threads=1,
                  blastdb='/usr/db/blastdb/',
                  outtype=5,
                  return_raw=False,
                  **kwargs):
        """A wrapper function for BLAST searches.

        With local_blast=True the BLAST program named by 'blast_type' is run as a subprocess inside 'blastdb',
        with the query passed on stdin in FASTA format. Otherwise the search is submitted through
        NCBIWWW.qblast, restricted to 'species' via an Entrez organism query.

        :param seq_record: The record containing the query sequence for the search. Can be either a SeqIO.SeqRecord or
                           something SeqIO.read() can open (e.g. a string with the file location).
        :param str species: The species whose sequence database will be queried (used for remote searches).
        :param Union[dict, str, Path] database: The name of the database to be used in the search.
        :param str blast_type: Type of BLAST search being performed. Local searches accept blastn, blastp, blastx,
                               tblastx and tblastn.
        :param str filetype: Filetype of seq_record (if seq_record is a SeqRecord object, leave as default).
                             [default: 'fasta']
        :param bool local_blast: Should the search be conducted locally or on remote servers? [Default: False]
        :param float expect: Highest expect value of BLAST results to be returned. [Default: 0.005]
        :param bool megablast: Should MegaBLAST be used for nucleotide searches? [Default: True]
        :param bool use_index: Should BLAST use indexes associated with the database files? Only applied to local
                               blastn searches. [Default: False]
        :param int perc_ident: Minimum percent identity required of results to be returned [Default: 75]
        :param bool verbose: Verbose output? [Default: True]
        :param int indent: Indent level for pretty print. [Default: 0]
        :param int n_threads: Number of threads to allocate for local BLAST [Default: 1]
        :param str blastdb: Working directory for the local BLAST process. [Default: '/usr/db/blastdb/']
        :param int outtype: Value passed as '-outfmt' to local BLAST. The processed (non-raw) return path parses
                            the output with NCBIXML. [Default: 5]
        :param bool return_raw: Return raw output rather than processed BioBlastRecord? [Default: False]
        :param kwargs: Additional keyword arguments; these are only added to the remote qblast call.
        :return: (blast_record, blast_err), or (raw result, blast_err) if return_raw is set. blast_err is
                 always None here; failures surface as exceptions.
        :raises SearchError: If local_blast is set and blast_type is not a supported BLAST program.
        """

        if not isinstance(seq_record, SeqIO.SeqRecord):
            seq_record = SeqIO.read(seq_record, filetype)
        args = dict()
        if verbose:
            print("Now starting BLAST...", indent=indent)
        if local_blast:
            # build up the BLAST arguments:
            args.update({
                '-db': str(database),
                '-evalue': expect,
                '-outfmt': str(outtype),
                '-num_threads': n_threads
            })
            if blast_type == 'blastn':
                if megablast:
                    args['-task'] = 'megablast'
                if use_index:
                    args['-use_index'] = use_index
                args['-perc_identity'] = perc_ident
            # Flatten {'-flag': value} into ['-flag', value, ...] for the command line.
            args_expanded = list()
            for flag, value in args.items():
                args_expanded.extend((flag, value))
            if verbose:
                print('Running BLAST locally...', indent=indent)
                print('Options:', indent=indent)
                print(args_expanded, indent=indent + 1)
            if blast_type not in [
                    "blastn", "blastp", "blastx", "tblastx", "tblastn"
            ]:
                raise SearchError("Invalid blast choice!")
            blast_cline = [blast_type] + args_expanded
            # With universal_newlines=True the output is a single str; a non-zero exit status
            # propagates as subprocess.CalledProcessError.
            blast_result = subprocess.check_output(
                [str(i) for i in blast_cline],
                input=seq_record.format('fasta'),
                universal_newlines=True,
                cwd=blastdb)
            blast_err = None
        else:
            args.update(
                dict(program=str(blast_type),
                     database=str(database),
                     sequence=seq_record.format('fasta'),
                     entrez_query='"{}"[ORGN]'.format(species),
                     expect=expect,
                     perc_ident=perc_ident))
            if megablast and blast_type == 'blastn':
                args['megablast'] = 'True'
            if kwargs:
                args.update(**kwargs)
            if verbose:
                print('Submitting Remote BLAST! Options passed:',
                      indent=indent)
                for k, v in args.items():
                    print('{0}\t=\t{1}'.format(k, v), indent=indent + 1)
            try:
                blast_result = NCBIWWW.qblast(**args)
                blast_err = None
            except Exception as err:
                print(type(err), err)
                raise

        if verbose:
            print('Done with Blast!', indent=indent)
        if return_raw:
            return blast_result, blast_err

        if isinstance(blast_result, StringIO):
            blast_record = NCBIXML.read(blast_result)
        else:
            try:
                with StringIO(''.join(blast_result)) as fin:
                    blast_record = NCBIXML.read(fin)
            except Exception as err:
                print('Error reading Blast Results! Aborting!',
                      indent=indent)
                print('Error details:\n', err, indent=indent)
                raise
        return blast_record, blast_err
Example #22
0
def count_dups(recblast_out):
    """Inverts the target-annotation dictionary to find, for every best-hit annotation, how many targets there are.

    The input is first normalised with simple_struct(recblast_out, verbose=False). For each target, the first
    entry of its annotation list is treated as the best hit; targets whose annotation list is empty are skipped.
    Intermediate values are printed as the structure is walked.

    :param recblast_out: RecBlast output accepted by simple_struct().
    :return tuple: (species_anno_target_dict, species_anno_count_dict), where the first maps
    species -> annotation -> list of target ids and the second maps species -> annotation -> number of targets.
    """
    species_anno_target_dict = {}
    species_anno_count_dict = {}
    master_dict = simple_struct(recblast_out, verbose=False)

    for species, species_dict in master_dict.items():
        anno_target_dict = species_anno_target_dict.setdefault(species, {})
        print(species_dict, indent=0)
        # The query key itself is not needed, only the per-query target mapping.
        for query_dict in species_dict.values():
            print(query_dict, indent=1)
            for target_id, annotation_list in query_dict.items():
                print(annotation_list, indent=2)
                if not annotation_list:
                    # A target without any annotation has no best hit to count.
                    continue
                tophit = annotation_list[0]
                print(tophit, indent=2)
                anno_target_dict.setdefault(tophit, []).append(target_id)
                print(anno_target_dict[tophit], indent=3)
    for species, anno_dict in species_anno_target_dict.items():
        print(species, indent=0)
        anno_count_dict = species_anno_count_dict.setdefault(species, {})
        for annotation, target_list in anno_dict.items():
            print(annotation, '\t\t\t', len(target_list))
            anno_count_dict[annotation] = len(target_list)
    return species_anno_target_dict, species_anno_count_dict
Example #23
0
def simple_struct(recblast_out, verbose=True):
    """Returns a nested dict of queries, targets, and annotations, optionally printing it as a diagram.

    Structure of the result:
        master_dict:
            Species|    species_dict:
                            Query|  query_dict:
                                        target_id|  annotations_list

    If 'recblast_out' is a list, every item is converted on its own (with the default verbosity) and the
    per-item results are merged into one dict. Otherwise it is treated as a mapping of
    species -> query -> record dict: a '__dict__' key is removed from it if present, and every record under
    'recblast_results' contributes the '|[...]|' items found in the part of its description after '|-|'.

    NOTE(review): with verbose=True each annotation is additionally replaced by the second value returned by
    id_search(annotation, id_type='brute'), so the returned lists differ between verbose and non-verbose calls.

    :param recblast_out: A mapping as described above, or a list of such mappings.
    :param bool verbose: Print the resulting structure (and resolve annotations through id_search)?
    :return dict: species -> query -> target_id -> list of annotations.
    """
    master_dict = {}
    # Raw string: '\|' and '\[' are regex escapes, not valid Python string escapes.
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation
    if isinstance(recblast_out, list):
        # Convert every item first; an item that cannot be converted is merged as it is.
        master_count = []
        for rc in recblast_out:
            try:
                master_count.append(simple_struct(rc))
            except AttributeError:
                master_count.append(rc)
        for subdict in master_count:
            for species, species_dict in subdict.items():
                if isinstance(species_dict, Exception):
                    continue
                comb_spec_dict = master_dict.setdefault(species, {})
                for query, query_dict in species_dict.items():
                    comb_query_dict = comb_spec_dict.setdefault(query, {})
                    for target_id, annotation_list in query_dict.items():
                        comb_anno_list = comb_query_dict.setdefault(target_id, [])
                        comb_anno_list += annotation_list if isinstance(annotation_list, list) else [annotation_list]
        return master_dict

    # assert isinstance(recblast_out, RecBlastContainer), 'Item in recblast_out was not a RecBlastContainer object!'
    try:
        recblast_out.__delitem__('__dict__')
    except KeyError:
        pass
    for species, rc_spec_rec in recblast_out.items():
        species_dict = master_dict.setdefault(species, {})
        for query, rc_rec in rc_spec_rec.items():
            query_dict = species_dict.setdefault(query, {})
            try:
                rc_out = rc_rec['recblast_results']
            except KeyError:
                print('No entries in recblast_results for query {0} in species {1}'.format(query, species))
                continue
            for record in rc_out:
                try:
                    target_id, annotations = record.description.split('|-|')
                except ValueError:
                    # The description did not consist of exactly two '|-|'-separated parts.
                    print(record.description, indent=2)
                    continue
                target_list = query_dict.setdefault(target_id, [])
                id_lst = pat.findall(annotations)
                if id_lst:
                    target_list += id_lst
                else:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(record.name,
                                                                                                 species,
                                                                                                 query))
    if verbose:
        print('*******************************************')
        for species, species_dict in master_dict.items():
            print(species, indent=0)
            for query, query_dict in species_dict.items():
                print(query, indent=1)
                for target_id, annotation_list in query_dict.items():
                    print(target_id, indent=2)
                    resolved = []
                    for annotation in annotation_list:
                        _, item, _, _ = id_search(annotation, id_type='brute', verbose=0)
                        resolved.append(item)
                    query_dict[target_id] = resolved
                    for annotation in resolved:
                        print(annotation, indent=3)
        print('*******************************************')
    return master_dict
Example #24
0
def _biosql_lookup_with_retry(dtbase, lookup_kwargs, retry_delay=0.1):
    """Look up one record in *dtbase*, retrying once after a ``KeyError``.

    :param dtbase: Object exposing ``lookup(**kwargs)``.
    :param dict lookup_kwargs: Keyword arguments forwarded to ``dtbase.lookup``.
    :param float retry_delay: Seconds to sleep before the single retry.
    :return: The result of ``biosql_dbseqrecord_to_seqrecord`` for the hit.
    :raises Exception: Anything other than the first ``KeyError`` propagates,
        including a ``KeyError`` or ``IndexError`` raised by the retry.
    """
    try:
        return biosql_dbseqrecord_to_seqrecord(dtbase.lookup(**lookup_kwargs))
    except KeyError:
        sleep(retry_delay)
        return biosql_dbseqrecord_to_seqrecord(dtbase.lookup(**lookup_kwargs))


def biosql_seq_lookup_cascade(dtbase,
                              sub_db_name,
                              id_type,
                              identifier,
                              indent=0,
                              verbose=False):
    """Try a series of lookup strategies until one returns a sequence.

    The strategies are tried in this order, stopping at the first success:

    1. ``id_type`` as declared (``'scaffold'`` is looked up as ``name``);
    2. the same key with everything after the first ``'.'`` removed from
       ``identifier``;
    3. ``primary_id``;
    4. ``name``;
    5. an ID type typed in interactively by the user (``'exit'`` terminates
       the interpreter).

    Each automatic strategy retries once after a ``KeyError``. An
    ``IndexError`` from either the first attempt or the retry is treated as
    "not found" and moves on to the next strategy; previously an
    ``IndexError`` raised by the retry escaped and aborted the cascade.

    :param dtbase: Database object exposing ``lookup(**kwargs)``.
    :param str sub_db_name: Name of the sub-database; only used in messages.
    :param str id_type: Declared type of ``identifier``.
    :param str identifier: The ID to look up.
    :param int indent: Indent level passed to the project ``print``.
    :param verbose: Emit progress and warning messages when truthy.
    :return: The record found, or ``SeqRecord(seq='')`` if every strategy
        fails with ``IndexError``.
    :raises KeyError: If a lookup raises ``KeyError`` twice in a row.
    """
    seqrec = SeqRecord(seq='')
    lookup_key = 'name' if id_type == 'scaffold' else id_type

    def attempt(lookup_kwargs, shown_id, warning):
        """Run one strategy; return ``(found, record)``."""
        try:
            record = _biosql_lookup_with_retry(dtbase, lookup_kwargs)
        except IndexError as err:
            if verbose:
                print(warning.format(shown_id, err), indent=indent)
            return False, None
        if verbose:
            print('\tGot sequence for {}!'.format(identifier), indent=indent)
        return True, record

    # 1. The ID type the caller declared.
    if verbose:
        print("\t\tNow searching database {0} for {1}: {2}".format(
            sub_db_name, id_type, identifier),
              indent=indent)
    found, record = attempt(
        {lookup_key: identifier}, identifier,
        "WARNING: couldn't find {0} using given ID type... \n Full error: {1}")
    if found:
        return record

    # 2. Same key, identifier stripped of its sub-number (xxxxxx.1 -> xxxxxx).
    identifier_sans_subnumber = identifier.split('.')[0]
    if verbose:
        print(
            '\t\tSeeing if removing any sub-numbers (acc: xxxxxx.1 for example) helps...',
            indent=indent)
        print('\t\tIdentifier: ', identifier_sans_subnumber, indent=indent)
        print("\t\tNow searching database {0} for {1}: {2}".format(
            sub_db_name, id_type, identifier_sans_subnumber),
              indent=indent)
    found, record = attempt(
        {lookup_key: identifier_sans_subnumber}, identifier_sans_subnumber,
        "WARNING: couldn't find {0} using abbreviated ID... \n Full error: {1}")
    if found:
        return record

    # 3. Primary ID, regardless of the declared type.
    if verbose:
        print(
            '\t\tAttempting to search using Primary ID instead of declared type:',
            indent=indent)
    found, record = attempt(
        {'primary_id': identifier}, identifier,
        "WARNING: couldn't find {0} using Primary ID... \n full error: {1}")
    if found:
        return record

    # 4. Name, regardless of the declared type.
    if verbose:
        print('\t\tAttempting to search using name instead of declared type:',
              indent=indent)
    found, record = attempt(
        {'name': identifier}, identifier,
        "WARNING: Still couldn't find {0} using name search: \n full error: {1}")
    if found:
        return record

    # 5. Last resort: ask the user which ID type to use (blocks on stdin).
    lookup_key = input('Last shot, chose an ID type: '
                       '[accession, primary_id, gi, version, display_id, name]')
    if lookup_key == 'exit':
        raise SystemExit
    try:
        seqrec = biosql_dbseqrecord_to_seqrecord(
            dtbase.lookup(**{lookup_key: identifier}))
        if verbose:
            print('\tGot sequence for {}!'.format(identifier), indent=indent)
    except IndexError as err5:
        if verbose:
            print(
                "WARNING: COULD NOT FIND SEQUENCES FOR ID:{0}: \n full error: {1}"
                .format(identifier, err5),
                indent=indent)
    return seqrec
Example #25
0
def fetchseq(ids,
             species,
             write=False,
             output_name='',
             delim='\t',
             id_type='brute',
             server=None,
             source="SQL",
             database="bioseqdb",
             database_path=None,
             host='localhost',
             driver='psycopg2',
             version='1.0',
             user='******',
             passwd='',
             email='',
             batch_size=50,
             output_type="fasta",
             verbose=1,
             n_threads=1,
             n_subthreads=1,
             add_length=(0, 0),
             indent=0):
    """Fetch sequences for a set of 12-field records using worker processes.

    Each record is wrapped in a ``FetchSeq`` task and placed on a queue that
    ``n_threads`` ``FetchSeqMP`` processes consume. Results are collected and
    returned in the order the records were given.

    :param ids: A list or generator of records, or an object with an
        ``open()`` method (e.g. a ``pathlib.Path``) naming a file with one ID
        per line. Every record must have 12 items: chr, (start, end), id,
        score, strand, thickStart, thickEnd, rgb, blockcount, blockspans,
        blockstarts, query_span.
    :param species: Forwarded to ``FetchSeqMP``.
    :param bool write: If true, write the sequences to ``output_name`` with
        ``SeqIO.write`` and return ``None``.
    :param str output_name: Destination for ``SeqIO.write`` when ``write``.
    :param server: An already-open server handle; when ``None`` and the
        source is SQL, one is opened with ``BioSeqDatabase.open_database``.
    :param str source: ``'sql'`` must appear in it (case-insensitive), or it
        must be one of ``fasta``, ``2bit``, ``twobit``.
    :param str output_type: Format name used by ``SeqIO.write``.
    :param int verbose: Verbosity level for progress messages.
    :param int n_threads: Number of ``FetchSeqMP`` processes to start.
    :param int indent: Indent level passed to the project ``print``.

    ``delim``, ``id_type``, ``database``, ``database_path``, ``host``,
    ``driver``, ``version``, ``user``, ``passwd``, ``email``, ``batch_size``,
    ``n_subthreads`` and ``add_length`` are forwarded to ``FetchSeqMP``;
    ``driver``, ``user``, ``passwd``, ``host`` and ``database`` are also used
    when opening the server.

    :return: ``(output_list, missing_items_list)``; ``missing_items_list`` is
        collapsed to ``None`` when it equals ``[None]``. Returns ``None`` when
        ``write`` is true and the string ``'None'`` when the ID file has no
        lines.
    :raises AssertionError: If a record does not have exactly 12 items.
    :raises SearchEngineNotImplementedError: For an unsupported ``source``.
    """
    if isgenerator(ids):
        if verbose > 1:
            print('Received generator!', indent=indent)
        # The records are validated, queued and counted below; that takes
        # several passes and a length, neither of which a generator offers.
        ids = list(ids)
    elif isinstance(ids, list):
        if verbose > 1:
            print('Received list!', indent=indent)
    else:
        if verbose > 1:
            print('Reading ID File... ', indent=indent)
        # Read-only: opening with 'w' truncated the file and made it
        # unreadable.
        with ids.open('r') as in_handle:
            id_prelist = [line.strip() for line in in_handle]
        if verbose > 1:
            print('Done!', indent=indent)
        ids = [id_item for id_item in id_prelist if id_item]
        if not id_prelist:
            if verbose:
                print('id_prelist is empty!', indent=indent)
            return 'None'
    for id_item in ids:
        # str() keeps the message buildable when fields are ints or tuples.
        assert len(id_item) == 12, (
            "Item {0} in id_list has {1} items, not 12!\n"
            "Format should be: "
            "chr, (start,end), id, score, strand, thickStart, thickEnd, rgb, blockcount,"
            " blockspans, blockstarts, query_span"
            "!").format(" ".join(str(item) for item in id_item), len(id_item))
    if verbose > 1:
        print('Readied ids!', indent=indent)

    id_list = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    if 'sql' in source.lower():
        if server is None:
            try:
                if verbose > 1:
                    print('No server received, opening server...',
                          indent=indent)
                server = BioSeqDatabase.open_database(driver=driver,
                                                      user=user,
                                                      passwd=passwd,
                                                      host=host,
                                                      database=database)
                if verbose > 1:
                    print('Done!', indent=indent)
            except Exception as err:
                if verbose > 1:
                    print('Failed to open server!', indent=indent)
                    print(str(type(err)), err, sep=' ', indent=indent)
                raise
        else:
            if verbose > 1:
                print('Received server handle:', indent=indent)
                print(server, indent=indent)
            if verbose > 2:
                print('Please note the sub_databases of server:\n\t',
                      [str(i) for i in server.keys()],
                      indent=indent)
    elif source.lower() in ['fasta', '2bit', 'twobit']:
        print('Search type: ', source, indent=indent)
    else:
        raise SearchEngineNotImplementedError(
            'Search using source {} has not yet been implemented!'.format(
                source))
    if verbose > 1:
        print('Creating FecSeq Processes...', indent=indent)
    fs_instances = [
        FetchSeqMP(id_queue=id_list,
                   seq_out_queue=results,
                   delim=delim,
                   id_type=id_type,
                   server=server,
                   species=species,
                   source=source,
                   database=database,
                   database_path=database_path,
                   host=host,
                   driver=driver,
                   version=version,
                   user=user,
                   passwd=passwd,
                   email=email,
                   output_type=output_type,
                   batch_size=batch_size,
                   verbose=verbose,
                   n_subthreads=n_subthreads,
                   add_length=add_length,
                   indent=indent + 1) for _ in range(n_threads)
    ]
    if verbose > 1:
        print('Done! Starting processes...', indent=indent)
    for fs in fs_instances:
        fs.start()
    if verbose > 1:
        print('Done!', indent=indent)
        print('Assigning FetchSeq records to queue... ', indent=indent)
    id_order = []
    # Count only the jobs that were really queued: if queuing stops early,
    # waiting for len(ids) results would block forever.
    n_jobs = 0
    for i, id_rec in enumerate(ids):
        try:
            label = "{0}:{1}-{2}".format(id_rec[0], id_rec[1][0],
                                         id_rec[1][1])
        except IndexError:
            label = "{0}".format(id_rec[0])
        try:
            id_list.put(FetchSeq(id_rec=id_rec))
        except AssertionError as err:
            print(i, type(err), err, sep=' ')
            break
        id_order.append(label)
        n_jobs += 1
    # One sentinel per worker.
    for _ in fs_instances:
        id_list.put(None)
    if verbose > 1:
        print('Done!', indent=indent)
    output_dict = dict()
    missing_items_list = list()
    if verbose > 1:
        print('Getting sequences from processes... ', indent=indent)
    while n_jobs:
        seq, missing = results.get()
        output_dict[seq[0]] = seq[1]
        missing_items_list.append(missing)
        n_jobs -= 1
    if verbose > 1:
        print('Done! Finished fetching sequences!', indent=indent)
        print('Closing processes!', indent=indent)
    for fs in fs_instances:
        if fs.is_alive():
            fs.join()
    # Restore input order; records without a result are silently skipped.
    output_list = [output_dict[i] for i in id_order if i in output_dict]
    if write:
        SeqIO.write(output_list, output_name, output_type)
        return
    # NOTE(review): only the single-record case is collapsed to None; several
    # records with nothing missing still yield [None, None, ...].
    if missing_items_list == [None]:
        missing_items_list = None
    return output_list, missing_items_list
Example #26
0
 def sql(id_item, seq_range, source, species, id_type, user, host, passwd,
         database, n_threads, version, server, indent, verbose):
     """Fetch one sequence from a BioSQL sub-database and slice it.

     The sub-database name is built from the first three letters of each
     title-cased word of ``species``. With ``version='auto'`` every key of
     ``server`` containing that prefix is a candidate and the one that sorts
     highest is used; otherwise ``version`` is appended to the prefix.

     :param id_item: Identifier handed to ``biosql_get_record``.
     :param seq_range: Pair whose two items are used as slice start and stop.
     :param str source: ``'mysql'`` (any case) selects the mysql driver;
         anything else selects psycopg2.
     :param str species: Species name used to derive the sub-database name.
     :param str version: ``'auto'`` (any case) or a suffix for the name.
     :param server: Mapping-like handle whose keys are sub-database names.
     :param int indent: Indent level passed to the project ``print``.
     :param int verbose: Verbosity level for progress messages.
     :return: ``(sliced sequence, itemnotfound)``.
     :raises NameError: If no sub-database matches in ``'auto'`` mode.
     :raises AssertionError: If the lookup does not return exactly one record.
     """
     driver = "psycopg2"
     if source.lower() == 'mysql':
         driver = "mysql"
     if verbose > 1:
         print('Searching for sequences in local SQL database...',
               indent=indent)
     if verbose > 2:
         print('Please note the sub_databases of server:\n\t',
               [str(i) for i in server.keys()],
               indent=indent)

     pick_automatically = version.lower() == 'auto'
     species_prefix = ''.join(
         [word[0:3] for word in species.title().split(' ')])
     if not pick_automatically:
         sub_db_name = species_prefix + version
     else:
         candidates = [name for name in server.keys()
                       if species_prefix in name]
         if not candidates:
             raise NameError('sub_db does not exist!')
         if len(candidates) == 1:
             sub_db_name = candidates[0]
         else:
             if verbose:
                 print('Multiple database versions found!', indent=indent)
                 print(candidates, indent=indent)
                 print('Selecting highest DB', indent=indent)
             sub_db_name = max(candidates)
         if verbose:
             print('Sub-DB chosen was ', sub_db_name, indent=indent)

     try:
         seq_dict, itemnotfound = biosql_get_record(sub_db_name=sub_db_name,
                                                    passwd=passwd,
                                                    id_list=id_item,
                                                    id_type=id_type,
                                                    driver=driver,
                                                    user=user,
                                                    host=host,
                                                    database=database,
                                                    num_proc=n_threads,
                                                    server=server,
                                                    verbose=True)
     except Exception:
         # Show what the server offers before letting the error propagate.
         print('Please note the sub_databases of server:\n\t',
               [str(i) for i in server.keys()],
               indent=indent)
         raise

     returned_ids = list(seq_dict.keys())
     assert len(returned_ids) == 1, (
         'Multiple sequences were returned for a single query!')
     record = seq_dict[returned_ids[0]]
     start, stop = seq_range[0], seq_range[1]
     return record[start:stop], itemnotfound
Example #27
0
    def __call__(self,
                 delim,
                 species,
                 version,
                 source,
                 passwd,
                 id_type,
                 driver,
                 user,
                 host,
                 database,
                 n_threads,
                 server,
                 verbose,
                 add_length,
                 indent,
                 database_path=None):
        """Fetch the sequence described by ``self.id_rec`` and annotate it.

        ``self.id_rec`` is unpacked into 12 fields (chr, seq_range, name,
        score, strand, thickStart, thickEnd, rgb, blockcount, blockspans,
        blockstarts, query_coverage). The sequence is fetched through
        ``self.entrez``, ``self.sql``, ``self.fasta`` or ``self.twobit``
        depending on ``source``, reverse-complemented if ``format_range``
        flipped the strand, and given a new ``'duplicate'`` feature carrying
        the record's fields as qualifiers.

        :param delim: Not used by this method.
        :param species: Key into ``database`` when it is a dict; also passed
            to ``get_searchdb`` and ``self.sql``.
        :param str source: One of ``entrez``, ``sql``, ``postgresql``,
            ``mysql``, ``fasta``, ``2bit``, ``twobit`` (case-insensitive).
        :param database: Database name, a ``{species: name}`` dict, or
            ``'auto'`` to resolve it with ``get_searchdb`` for 2bit/blastdb.
        :param add_length: When not ``(0, 0)``, ``seq_range`` and ``strand``
            are recomputed with ``format_range`` and written back to
            ``self.id_rec``.
        :param int verbose: Verbosity level for progress messages.
        :param int indent: Indent level passed to the project ``print``.
        :param database_path: If truthy, prefixed to ``database`` as a
            directory.

        ``version``, ``passwd``, ``id_type``, ``user``, ``host``,
        ``n_threads`` and ``server`` are forwarded to ``self.sql``;
        ``driver`` is not used by this method.

        :return: ``(id_full, seq, itemnotfound)``.
        :raises DatabaseNotFoundError: If ``species`` is missing from a dict
            ``database`` or ``source`` is not recognised.
        :raises AssertionError: If ``seq_range`` does not have two items.
        """
        # Normalise once so every comparison below treats case the same way.
        source_key = source.lower()
        if isinstance(database, dict):
            if species in database:
                database = database[species]
            else:
                raise DatabaseNotFoundError(
                    'No sequence source database for species {} '
                    'was found in the provided dict!'.format(species))
        elif database == "auto" and source_key in ("2bit", "twobit",
                                                   "blastdb"):
            database = get_searchdb(search_type=source_key,
                                    species=species,
                                    db_loc=database_path,
                                    verbose=verbose,
                                    indent=indent + 1).name
        if database_path:
            database = database_path.rstrip("/") + '/' + database
        if verbose > 1:
            print('Full header for Entry:', indent=indent)
            print(self.id_rec, indent=indent)
        (item_chr, seq_range, item_name, score, strand, thickStart, thickEnd,
         rgb, blockcount, blockspans, blockstarts,
         query_coverage) = self.id_rec

        try:
            if verbose > 1:
                print('Seq range: ', seq_range, indent=indent)
            assert len(
                seq_range) == 2, 'Seq_range returned a tuple of length != 2!!!'
            old_strand = strand
            if add_length != (0, 0):
                seq_range, strand = format_range(seqrange=seq_range,
                                                 strand=strand,
                                                 addlength=add_length,
                                                 indent=indent + 1,
                                                 verbose=verbose)
                self.id_rec[1] = seq_range
                self.id_rec[4] = strand
            # A -1 coordinate means no sub-range: the ID is the bare name.
            if -1 in seq_range[0:2]:
                id_full = '{0}'.format(item_chr)
            else:
                id_full = '{0}:{1}-{2}'.format(item_chr, seq_range[0],
                                               seq_range[1])
        except KeyError as err:
            raise KeyError(
                'Sequence {0} lacks a seq_range entry!!!'.format(
                    item_chr)) from err
        if verbose:
            print('ID for query:\t', id_full, indent=indent)

        # Armed with the ID list, we fetch the sequences from the appropriate source

        if source_key == "entrez":
            seq, itemnotfound = self.entrez(item_chr, seq_range, indent,
                                            add_length, verbose)
        # "sql" is the default source in fetchseq, so it must be accepted here.
        elif source_key in ("sql", "postgresql", "mysql"):
            seq, itemnotfound = self.sql(id_item=item_chr,
                                         seq_range=seq_range,
                                         source=source,
                                         species=species,
                                         id_type=id_type,
                                         user=user,
                                         host=host,
                                         passwd=passwd,
                                         database=database,
                                         n_threads=n_threads,
                                         version=version,
                                         server=server,
                                         indent=indent,
                                         verbose=verbose)
        elif source_key == "fasta":  # Note: anecdotally, this doesn't run terribly fast - try to avoid.
            seq, itemnotfound = self.fasta(id_item=item_chr,
                                           seq_range=seq_range,
                                           database=database,
                                           source=source_key,
                                           indent=indent,
                                           verbose=verbose)
        elif source_key in ("2bit", "twobit"):
            seq, itemnotfound = self.twobit(id_full=id_full,
                                            id_item=item_chr,
                                            database=database,
                                            indent=indent,
                                            verbose=verbose)
        else:
            raise DatabaseNotFoundError(
                'Not a valid database source: {}'.format(source))
        if itemnotfound is not None and verbose > 1:
            print('Some items were not found:', indent=indent)
            print(itemnotfound, indent=indent)
        if old_strand != strand:
            if verbose > 1:
                print('Sequence was inverted! Reverse complementing now...',
                      indent=indent)
            seq.seq = seq.seq.reverse_complement()
            if verbose > 1:
                print('Done!', indent=indent)

        # Keep a handle on the new feature: seq.features[0] may be a
        # feature the fetched record already carried.
        feature = SeqFeature.SeqFeature(type='duplicate')
        seq.features.append(feature)
        if strand == '+':
            strand_code = 1
        elif strand == '-':
            strand_code = -1
        else:
            # FeatureLocation only accepts +1, -1, 0 or None; "." is rejected.
            strand_code = None
        feature.location = SeqFeature.FeatureLocation(int(seq_range[0]),
                                                      int(seq_range[1]),
                                                      strand=strand_code)
        feature.qualifiers['score'] = score
        feature.qualifiers['query_coverage'] = query_coverage
        feature.qualifiers['thickStart'] = thickStart
        feature.qualifiers['thickEnd'] = thickEnd
        feature.qualifiers['itemRGB'] = rgb
        feature.qualifiers['blockCount'] = blockcount
        feature.qualifiers['blockSizes'] = blockspans
        feature.qualifiers['blockStarts'] = blockstarts

        seq.name = item_chr
        return id_full, seq, itemnotfound
Example #28
0
    def blat_run(seq_record,
                 port,
                 local="localhost",
                 filetype="fasta",
                 blat_type='blat',
                 perc_ident=None,
                 verbose=True,
                 indent=0,
                 blatdb='/usr/db/blastdb/',
                 outtype='pslx'):
        """Run a BLAT search through ``gfClient`` and parse the result.

        The query is piped to ``gfClient`` on stdin as FASTA and the result
        is read from its stdout. The last line of that output is dropped
        before it is handed to ``SearchIO.read``.

        :param seq_record: The query, either a SeqRecord or a str with the
                           location of a file readable by ``SeqIO.read``.
        :param int port: Port of the gfServer to be queried.
        :param str local: Host address. [Default: 'localhost']
        :param str filetype: Format of the file when seq_record is a path.
                             [Default: 'fasta']
        :param str blat_type: 'tblat' adds ``-t=dnax -q=prot``; any other
                              value runs gfClient without them.
                              [Default: 'blat']
        :param perc_ident: Value for ``-minIdentity``; a falsy value is sent
                           as 0. [Default: None]
        :param verbose: Verbosity level. [Default: True]
        :param int indent: Indent level for pretty print. [Default: 0]
        :param str blatdb: Working directory gfClient is started in.
                           [Default: '/usr/db/blastdb/']
        :param str outtype: One of pslx, psl, blast8, blast9, blast.
                            [Default: 'pslx']
        :return: (blat_record, blat_err); blat_err is always None.
        :raises TypeError: If seq_record is neither a str nor a SeqRecord.
        :raises SearchError: If gfClient writes to stderr, or outtype is not
                             one of the supported values.
        :raises NoHitsError: If SearchIO.read raises ValueError on the output.
        """
        if isinstance(seq_record, str):
            seq_record = SeqIO.read(seq_record, filetype)
        elif not isinstance(seq_record, SeqIO.SeqRecord):
            raise TypeError(
                'seq_record was of type {}, must be either '
                'a str with filepath or a SeqRecord object!'.format(
                    type(seq_record)))

        if verbose:
            print("Now starting BLAT...", indent=indent)
        if verbose > 1:
            print('Search Type: ', blat_type, indent=indent)

        args_expanded = [
            'gfClient', local,
            str(port), '/', '/dev/stdin', '/dev/stdout'
        ]
        if blat_type.lower() == 'tblat':
            args_expanded += ['-t=dnax', '-q=prot']
        # gfClient options need the leading dash; without it minIdentity was
        # taken as an extra positional argument.
        args_expanded += [
            '-minIdentity={}'.format(perc_ident if perc_ident else 0),
            '-out={}'.format(outtype)
        ]

        if verbose > 1:
            print('BLAT command:', indent=indent)
            print(' '.join(args_expanded), indent=indent + 1)

        blat = subprocess.Popen(args_expanded,
                                stdout=subprocess.PIPE,
                                universal_newlines=True,
                                cwd=blatdb,
                                stdin=subprocess.PIPE,
                                stderr=subprocess.PIPE)

        blat_raw, blat_raw_err = blat.communicate(
            input=seq_record.format('fasta'))
        if blat_raw_err:
            raise SearchError(blat_raw_err)

        # Drop the final line of the output. This used to shell out to
        # `head -n -1`, which needs GNU head and a second process.
        blat_result = ''.join(StringIO(blat_raw).readlines()[:-1])
        # The old value came from head's stderr, which was never piped, so it
        # was always None; keep returning None.
        blat_err = None
        if verbose > 2:
            print(blat_result, indent=indent)
        if verbose:
            print('Done!', indent=indent)

        # SearchIO.read keyword arguments for each supported output type.
        read_options = {
            'pslx': {'format': 'blat-psl', 'pslx': True},
            'psl': {'format': 'blat-psl'},
            'blast8': {'format': 'blast-tab'},
            'blast9': {'format': 'blast-tab', 'comments': True},
            'blast': {'format': 'blast-xml'},
        }
        blat_record = None
        with StringIO(blat_result) as fin:
            try:
                if outtype not in read_options:
                    raise SearchError('Invalid out type')
                blat_record = SearchIO.read(fin, **read_options[outtype])
            except ValueError:
                if verbose:
                    print(
                        'No Query Results were found in handle for seq_record {}!'
                        .format(seq_record.id))
                raise NoHitsError(
                    'No Query Results were found in handle for seq_record {}!'.
                    format(seq_record.id))
            except Exception:
                print('Error reading BLAT results! Aborting!')
                print('Error details:\n')
                raise
        return blat_record, blat_err
Example #29
0
def biosql_get_record(id_list,
                      sub_db_name,
                      passwd='',
                      id_type='accession',
                      driver="psycopg2",
                      indent=0,
                      user="******",
                      host="localhost",
                      database="bioseqdb",
                      num_proc=2,
                      verbose=True,
                      server=None):
    """Look up a batch of identifiers using ``GetSeqMP`` worker processes.

    Every identifier is wrapped in a ``BioSeqLookupCascade`` task and queued;
    ``num_proc`` workers consume the queue, each stopped by one ``None``
    sentinel. One result is collected per identifier.

    :param id_list: A list of identifiers; any non-list value is treated as a
        single identifier.
    :param sub_db_name: Passed to each ``GetSeqMP`` worker.
    :param passwd: Passed to each ``GetSeqMP`` worker.
    :param id_type: Passed to each ``BioSeqLookupCascade`` task.
    :param driver: Passed to each ``GetSeqMP`` worker.
    :param indent: Indent level passed to the project ``print``.
    :param user: Passed to each ``GetSeqMP`` worker.
    :param host: Passed to each ``GetSeqMP`` worker.
    :param database: Passed to each ``GetSeqMP`` worker.
    :param num_proc: Number of worker processes to start.
    :param verbose: Verbosity level for progress messages.
    :param server: Passed to each ``GetSeqMP`` worker.
    :return: ``(seqdict, itemsnotfound)``: results keyed by their first
        element, and the identifiers that have no entry in ``seqdict``.

    Example::

        biosql_get_record(sub_db_name='MyoLuc2.0', passwd='',
                          id_list=['NW_005871148', 'NW_005871300', 'NW_005871148'],
                          id_type='accession', driver="psycopg2", user="******",
                          host="localhost", database="bioseqdb", verbose=True)
    """
    task_queue = multiprocessing.JoinableQueue()
    result_queue = multiprocessing.Queue()

    if verbose > 2:
        print('\tStarting biosql_get_record_mp', indent=indent)
    if not isinstance(id_list, list):
        id_list = [id_list]

    workers = [
        GetSeqMP(task_queue,
                 result_queue,
                 database=database,
                 host=host,
                 driver=driver,
                 user=user,
                 passwd=passwd,
                 sub_db_name=sub_db_name,
                 verbose=verbose,
                 server=server) for _ in range(num_proc)
    ]
    for worker in workers:
        worker.start()

    for identifier in id_list:
        task = BioSeqLookupCascade(id_type=id_type,
                                   identifier=identifier,
                                   verbose=verbose,
                                   indent=indent)
        task_queue.put(task)

    # One sentinel per worker.
    for _ in range(num_proc):
        task_queue.put(None)

    # Exactly one result is expected per queued identifier.
    seqdict = {}
    for _ in range(len(id_list)):
        outcome = result_queue.get()
        print(outcome, indent=indent)
        found_id = outcome[0]
        record = outcome[1]
        record.name = found_id
        seqdict[found_id] = record
    if verbose:
        print('Done with biosql_get_record_mp!', indent=indent)
        print('Closing processes!', indent=indent)
    for worker in workers:
        if worker.is_alive():
            worker.join()

    itemsnotfound = [
        identifier for identifier in id_list if identifier not in seqdict
    ]

    return seqdict, itemsnotfound