def setup_bot(bot):
    # Argument handling
    bot.debug = any("debug" in arg.lower() for arg in sys.argv)

    # Logging
    init_sentry(bot)
    bot.sentry = sentry
    discord_log = logging.getLogger("discord")
    discord_log.setLevel(logging.CRITICAL if not bot.debug else logging.INFO)
    log = logging.getLogger("bot")
    bot.log = log
    log.info(f"\n{get_banner()}\nLoading....")

    # Load modules
    bot.session = aiohttp.ClientSession(loop=bot.loop)
    bot.load_extension("modules.Events.Ready")

    # Database
    credentials = {
        "user": os.getenv("POSTGRES_USER"),
        "password": os.getenv("POSTGRES_PASS"),
        "database": os.getenv("POSTGRES_DATABASE"),
        "host": os.getenv("POSTGRES_HOST"),
        "init": init_connection
    }
    bot.pool = bot.loop.run_until_complete(asyncpg.create_pool(**credentials))
    bot.log.info(
        f"Postgres connected to database ({bot.pool._working_params.database})"
        f" under the ({bot.pool._working_params.user}) user")

    # Config
    bot.config = get_config
    bot.kclient = KClient(api_key=os.getenv("KSOFT"))
    bot.spotify = spotipy.Spotify(
        client_credentials_manager=SpotifyClientCredentials(
            client_id=os.getenv("SPOTIFY_ID"),
            client_secret=os.getenv("SPOTIFY_SECRET")))
    bot.uptime = datetime.datetime.utcnow()
    bot.version = {
        "bot": bot.config()["version"],
        "python": sys.version.split(" ")[0],
        "discord.py": discord.__version__
    }
    bot.counter = Counter()
    bot.commands_used = Counter()
    bot.process = psutil.Process()
    bot.color = bot.config()["colors"]["main"]
    bot.error_color = bot.config()["colors"]["error"]
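# Hypothetical launcher sketch showing how a setup_bot like the one above is typically
# wired up in a discord.py 1.x-style entry point. The command prefix and the
# DISCORD_TOKEN environment variable name are assumptions, not taken from the source.
import os
from discord.ext import commands

bot = commands.Bot(command_prefix="!")
setup_bot(bot)
bot.run(os.getenv("DISCORD_TOKEN"))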
def count_text(words):
    """
    Count the occurrences of each word in the given list.

    :param words: a list of words
    :return: a dict-like Counter mapping each word to its frequency
    """
    return Counter(words)
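# Minimal usage sketch for count_text; the sample sentence is made up for illustration
# and assumes count_text (and collections.Counter) are importable from the module above.
words = "the cat sat on the mat".split()
print(count_text(words))
# -> Counter({'the': 2, 'cat': 1, 'sat': 1, 'on': 1, 'mat': 1})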
def build_vocab(self, *args, **kwargs):
    """Add unaligned_token to the list of special symbols."""
    counter = Counter()
    sources = []
    for arg in args:
        # arg is a QEDataset instance, which contains examples and fields
        if isinstance(arg, data.Dataset):
            sources += [
                getattr(arg, name)
                for name, field in arg.fields.items()
                if field is self
            ]
            # sources is a list whose elements are iterables over samples
        else:
            sources.append(arg)
    for sample in sources:
        for x in sample:
            # Each iteration reads one sample, normalizes it to a list, then updates the counter
            if not self.sequential:
                x = [x]
            try:
                counter.update(x)
            except TypeError:
                counter.update(chain.from_iterable(x))
    specials = list(
        OrderedDict.fromkeys(tok for tok in [
            self.unk_token, self.pad_token, self.init_token, self.eos_token,
            self.unaligned_token
        ] if tok is not None))  # e.g. ['<unk>', '<pad>', '<unaligned>']
    self.vocab = self.vocab_cls(counter, specials=specials, **kwargs)
def reset(self):
    self.tok2id = dict()
    self.id2tok = dict()
    self._add_special_tokens()
    self._set_special_token_ids()
    self.vocab = Counter()
def get_product_count_dict_from_prokka(ffn_file: Path) -> dict:
    """
    Reads a Prokka FASTA file and returns a dict of counts of each product name
    extracted from the headers
    """
    product_list = []
    with open(str(ffn_file)) as f:
        for line in f.readlines():
            if line.startswith(">"):
                line = line.strip()
                product = get_product_from_prokka_fasta_header(line)
                product_list.append(product)
    product_count_dict = dict(Counter(product_list))
    return product_count_dict
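# get_product_from_prokka_fasta_header is not defined in this excerpt. Below is a minimal
# sketch of what it might do, assuming Prokka's usual ">LOCUS_TAG product description"
# header layout; treat the parsing rule as an assumption rather than the project's code.
def get_product_from_prokka_fasta_header(header: str) -> str:
    # Drop the leading ">" and take everything after the first space as the product
    header = header.lstrip(">")
    parts = header.split(" ", 1)
    return parts[1] if len(parts) > 1 else ""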
def get_seperate_list(raw_result):
    """
    Separate an n-dimensional list into n lists, grouped by item position.
    ex: get_seperate_list([[1, 'b'], [2, 'a']])
    >>> [1, 2], ['b', 'a']
    :param raw_result: a list of equal-length rows
    :return: a generator yielding one list per column
    """
    # Number of items per row (columns) and number of rows
    count = len(raw_result[0])
    total = len(raw_result)
    i = 0
    j = 0
    while j < count:
        new_list = []
        while i < total:
            new_list.append(raw_result[i][j])
            i += 1
        yield new_list
        i = 0
        j += 1
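# Usage sketch for get_seperate_list, mirroring the docstring example; the generator
# yields one list per column of the input rows.
rows = [[1, 'b'], [2, 'a']]
print(list(get_seperate_list(rows)))
# -> [[1, 2], ['b', 'a']]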
def load_from_vocab_file(self, file):
    """ Load vocabulary from a .vocab file """
    self.tok2id = dict()
    self.id2tok = dict()
    self._add_special_tokens()
    self._set_special_token_ids()
    self.vocab = Counter()
    with open(file, encoding="utf-8") as f:
        for line in f:
            # Each line holds a tab-separated "token<TAB>count" pair
            token, count = line.split("\t")
            self.vocab[token] = float(count)
            self.add_token(token)
def __init__(self, pad="<pad>", sos="<sos>", eos="<eos>", unk="<unk>", oovs=0):
    self.PAD = pad
    self.SOS = sos
    self.EOS = eos
    self.UNK = unk
    self.oovs = oovs
    self.vocab = Counter()
    self.tok2id = dict()
    self.id2tok = dict()
    self.size = 0
    self.subword = None
def __init__(self, pad="<pad>", sos="<sos>", eos="<eos>", unk="<unk>",
             oovs=0, file=None, preprocess=None, subword=None, lang=None):
    self.PAD = pad
    self.SOS = sos
    self.EOS = eos
    self.UNK = unk
    self.PAD_id = 0
    self.SOS_id = None
    self.EOS_id = None
    self.UNK_id = None
    self.lang = lang
    self.oovs = oovs
    self.vocab = Counter()
    self.tok2id = dict()
    self.id2tok = dict()
    self.freqs = dict()
    self.gpt2_tokenizer = None
    self.is_gpt2 = False
    self.subword = subword
    if file is not None:
        assert preprocess is not None, "Need preprocess() to build vocab!"
        self.build(file, preprocess)
def fit_vocab(
    self,
    samples,
    vocab_size=None,
    vocab_min_freq=0,
    embeddings_name=None,
    keep_rare_words_with_embeddings=False,
    add_embeddings_vocab=False,
):
    tokens = Counter()
    for sample in samples:
        # TODO: subtokenize?
        tokens.update(self.tokenize(sample))
    # We use our own Vocabulary class
    specials = list(
        OrderedDict.fromkeys(
            tok for tok in [self.unaligned_token] if tok is not None
        )
    )
    # TODO: handle embeddings/vectors
    self.vocab = Vocabulary(
        tokens,
        max_size=vocab_size,
        min_freq=vocab_min_freq,
        unk_token=self.unk_token,
        pad_token=self.pad_token,
        bos_token=self.bos_token,
        eos_token=self.eos_token,
        specials=specials,
        specials_first=self.specials_first,
        # TODO: missing vectors, etc.
        vectors=None,
        rare_with_vectors=keep_rare_words_with_embeddings,
        add_vectors_vocab=add_embeddings_vocab,
    )
def setup_bot(bot):
    bot.config = bot.loop.run_until_complete(get_config())
    bot.remove_command('help')
    discord_log = logging.getLogger('discord')
    log = logging.getLogger("main")
    discord_log.setLevel(logging.CRITICAL)
    bot.log, bot.logger = log, log
    log.info(f"{get_icon()}\nLoading....\n")
    bot.formatter = EmbededHelp()
    bot.debug = any('debug' in arg.lower() for arg in sys.argv)
    bot.uptime = datetime.datetime.utcnow()
    bot.commands_used = Counter()
    bot.messages_sent = 0
    bot.process = psutil.Process()
    bot.at_everyone_seen = 0
    bot.session = aiohttp.ClientSession(loop=bot.loop)
    bot.keys = bot.loop.run_until_complete(get_keys())
    bot.config = bot.loop.run_until_complete(get_config())
    bot.conn = bot.loop.run_until_complete(
        r.connect("localhost", db=bot.config['db'], port=28015))
    bot.loop.run_until_complete(module_loader(bot, "handlers"))
    bot.loop.run_until_complete(module_loader(bot, "commands"))
    if bot.debug:
        discord_log.setLevel(logging.INFO)
def populate_product_occurence_dictionary(self):
    """
    Counts the number of times each product occurs in self.total_product_list
    and stores the result as a Counter in self.product_occurence_dictionary
    """
    self.product_occurence_dictionary = Counter(self.total_product_list)
def get_member_counts(df: pd.DataFrame) -> dict:
    # Get a complete list of all cluster_rep values including duplicates, then count totals
    raw_cluster_rep_list = list(df['cluster_rep'])
    member_counts = Counter(raw_cluster_rep_list)
    return member_counts
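# Hypothetical usage sketch for get_member_counts. The DataFrame contents are invented
# for illustration; only the 'cluster_rep'/'cluster_member' column names come from the
# surrounding code.
import pandas as pd

example_df = pd.DataFrame({
    "cluster_rep": ["c1", "c1", "c2"],
    "cluster_member": ["s1_00001", "s2_00004", "s1_00002"],
})
print(get_member_counts(example_df))
# -> Counter({'c1': 2, 'c2': 1})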
def filter_core_cluster_tsv(cluster_tsv: Path, outdir: Path):
    """
    Filters/repairs the core cluster .tsv output file to ensure there are no cases where a single
    sample is represented in a cluster more than once. Previous versions of MMSeqs2 would sometimes
    encounter minor issues with clustering, and this method attempts to fix this.
    TODO: Verify this is still necessary

    :param cluster_tsv: path to cluster .tsv file
    :param outdir: path to desired output directory
    :return: path to filtered/repaired cluster .tsv
    """
    # Read TSV
    df = cluster_tsv_to_df(cluster_tsv=cluster_tsv)

    # TODO: Refactor some of the logic below into clean functions

    # Get the cluster member dict
    cluster_member_dict = get_cluster_member_dict(df)

    # Determine which clusters contain duplicate samples, create a dictionary to store them
    to_repair_dict = {}
    for cluster_rep, members in cluster_member_dict.items():
        sample_id_list = []
        for m in members:
            sample_id = m.rsplit("_", 1)[0]
            sample_id_list.append(sample_id)
        counter_dict = Counter(sample_id_list)
        problematic_sample_id_dict = {
            x: counter_dict[x] for x in counter_dict if counter_dict[x] > 1
        }
        if len(problematic_sample_id_dict) >= 1:
            log.debug(
                f"Warning: Detected problematic samples: {problematic_sample_id_dict}"
            )
            to_repair_dict[cluster_rep] = problematic_sample_id_dict
        else:
            continue

    # Write a file containing all clusters/members flagged as problematic
    debug_dict = {}
    for cluster_rep, members in cluster_member_dict.items():
        sample_id_list = []
        for m in members:
            sample_id = m.rsplit("_", 1)[0]
            sample_id_list.append(sample_id)
        counter_dict = Counter(sample_id_list)
        problematic_sample_id_dict = {
            x: counter_dict[x] for x in counter_dict if counter_dict[x] > 1
        }
        if len(problematic_sample_id_dict) >= 1:
            debug_list = []
            for s in list(problematic_sample_id_dict.keys()):
                for m in members:
                    if s in m:
                        debug_list.append(m)
            debug_dict[cluster_rep] = debug_list
    debug_file = Path(outdir / 'master_genome_DB.cluster.debug.tsv')
    with open(str(debug_file), 'w') as f:
        for cluster_rep, members in debug_dict.items():
            for m in members:
                f.write(f"{cluster_rep}\t{m}\n")

    # Repair the summary .tsv by removing rows until only 1 gene per sample is represented within a cluster
    # Determine indexes of rows to filter out
    row_indexes_to_remove = []
    for i, row in enumerate(df.itertuples()):
        cluster_rep = row.cluster_rep
        cluster_member = row.cluster_member
        if cluster_rep in to_repair_dict.keys():
            # Member sample_IDs that need to be reduced
            members_dict = to_repair_dict[cluster_rep]
            members_list = list(members_dict.keys())
            # Detect if this is a row we want to remove
            for m in members_list:
                if m in cluster_member:
                    # Check the counter dict to see if there is > 1 member for this cluster
                    if to_repair_dict[cluster_rep][m] > 1:
                        log.debug(f"Repairing {cluster_rep}")
                        row_indexes_to_remove.append(i)
                        # Decrement the counter dict
                        to_repair_dict[cluster_rep][m] -= 1
        else:
            continue

    # Create a filtered version of df with duplicate rows removed
    filtered_df = df.drop(df.index[row_indexes_to_remove])

    # Export to TSV file
    outpath = outdir / cluster_tsv.with_suffix(".filtered.tsv").name
    filtered_df.to_csv(outpath, sep="\t", header=None, index=None)
    get_difference_between_cluster_tsvs(cluster_tsv=cluster_tsv,
                                        filtered_cluster_tsv=outpath)
    return outpath
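# cluster_tsv_to_df and get_cluster_member_dict are referenced above but not defined in this
# excerpt. The following is a minimal sketch of what they might look like, assuming the MMSeqs2
# cluster .tsv has two tab-separated columns (cluster representative, cluster member) and that
# the member dict maps each representative to a list of its members. Treat both bodies as
# assumptions, not the project's actual implementation.
from pathlib import Path
import pandas as pd

def cluster_tsv_to_df(cluster_tsv: Path) -> pd.DataFrame:
    # Assumed layout: no header row, columns are cluster_rep then cluster_member
    return pd.read_csv(cluster_tsv, sep="\t", header=None,
                       names=["cluster_rep", "cluster_member"])

def get_cluster_member_dict(df: pd.DataFrame) -> dict:
    # Group member names under their cluster representative
    return df.groupby("cluster_rep")["cluster_member"].apply(list).to_dict()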