Example #1
def setup_bot(bot):
    # Argument Handling
    bot.debug = any("debug" in arg.lower() for arg in sys.argv)

    # Logging
    init_sentry(bot)
    bot.sentry = sentry
    discord_log = logging.getLogger("discord")
    discord_log.setLevel(logging.CRITICAL if not bot.debug else logging.INFO)
    log = logging.getLogger("bot")
    bot.log = log
    log.info(f"\n{get_banner()}\nLoading....")

    # Load modules
    bot.session = aiohttp.ClientSession(loop=bot.loop)
    bot.load_extension("modules.Events.Ready")

    # Database
    credentials = {
        "user": os.getenv("POSTGRES_USER"),
        "password": os.getenv("POSTGRES_PASS"),
        "database": os.getenv("POSTGRES_DATABASE"),
        "host": os.getenv("POSTGRES_HOST"),
        "init": init_connection
    }
    bot.pool = bot.loop.run_until_complete(asyncpg.create_pool(**credentials))
    bot.log.info(
        f"Postgres connected to database ({bot.pool._working_params.database})"
        f" under the ({bot.pool._working_params.user}) user")

    # Config
    bot.config = get_config
    bot.kclient = KClient(api_key=os.getenv("KSOFT"))
    bot.spotify = spotipy.Spotify(
        client_credentials_manager=SpotifyClientCredentials(
            client_id=os.getenv("SPOTIFY_ID"),
            client_secret=os.getenv("SPOTIFY_SECRET")))
    bot.uptime = datetime.datetime.utcnow()
    bot.version = {
        "bot": bot.config()["version"],
        "python": sys.version.split(" ")[0],
        "discord.py": discord.__version__
    }
    bot.counter = Counter()
    bot.commands_used = Counter()
    bot.process = psutil.Process()
    bot.color = bot.config()["colors"]["main"]
    bot.error_color = bot.config()["colors"]["error"]
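
A quick aside on the Counter attributes set up above: bot.counter and bot.commands_used are plain collections.Counter objects that other parts of the bot can increment freely. A minimal standalone sketch (hypothetical command names, outside any Discord context):

from collections import Counter

commands_used = Counter()                      # stands in for bot.commands_used
for name in ["ping", "help", "ping"]:          # hypothetical commands invoked at runtime
    commands_used[name] += 1
print(commands_used.most_common(1))            # [('ping', 2)]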
Example #2
def count_text(words):
    """
	This function will count the words in the list given
	:param words: a list of words
	:return: a dict where the key is the word and the value is the frequency of the word
	"""
    return Counter(words)
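
A minimal usage sketch, assuming the count_text definition above and the collections.Counter import (input is hypothetical):

words = ["spam", "eggs", "spam"]   # hypothetical input
print(count_text(words))           # Counter({'spam': 2, 'eggs': 1})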
Example #3
    def build_vocab(self, *args, **kwargs):
        """Add unaligned_token to the list of special symbols."""
        counter = Counter()
        sources = []
        for arg in args:  # each arg is a QEDataset, which holds examples and fields
            if isinstance(arg, data.Dataset):
                sources += [
                    getattr(arg, name) for name, field in arg.fields.items()
                    if field is self
                ]  # sources is a list whose elements are iterables over one field
            else:
                sources.append(arg)
        for sample in sources:
            for x in sample:  # read one sample at a time, normalize it to a list, then update the counter
                if not self.sequential:
                    x = [x]
                try:
                    counter.update(x)
                except TypeError:
                    counter.update(chain.from_iterable(x))
        specials = list(
            OrderedDict.fromkeys(tok for tok in [
                self.unk_token, self.pad_token, self.init_token,
                self.eos_token, self.unaligned_token
            ] if tok is not None))  # e.g. ['<unk>', '<pad>', '<unaligned>']
        self.vocab = self.vocab_cls(counter, specials=specials, **kwargs)
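
The try/except above lets the same loop handle both flat and nested samples: Counter.update raises TypeError when a sample contains unhashable sub-lists, and chain.from_iterable flattens them instead. A standalone sketch with hypothetical token lists:

from collections import Counter
from itertools import chain

counter = Counter()
flat = ["the", "cat"]                      # flat sample: Counter.update counts the strings directly
nested = [["the", "dog"], ["barks"]]       # nested sample: the inner lists are unhashable
for x in (flat, nested):
    try:
        counter.update(x)
    except TypeError:
        counter.update(chain.from_iterable(x))
print(counter)                             # Counter({'the': 2, 'cat': 1, 'dog': 1, 'barks': 1})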
Example #4
    def build_vocab(self, *args, **kwargs):
        """Add unaligned_token to the list of special symbols."""
        counter = Counter()
        sources = []
        for arg in args:
            if isinstance(arg, data.Dataset):
                sources += [
                    getattr(arg, name) for name, field in arg.fields.items()
                    if field is self
                ]
            else:
                sources.append(arg)
        for sample in sources:
            for x in sample:
                if not self.sequential:
                    x = [x]
                try:
                    counter.update(x)
                except TypeError:
                    counter.update(chain.from_iterable(x))
        specials = list(
            OrderedDict.fromkeys(tok for tok in [
                self.unk_token,
                self.pad_token,
                self.init_token,
                self.eos_token,
                self.unaligned_token,
            ] if tok is not None))
        self.vocab = self.vocab_cls(counter, specials=specials, **kwargs)
Example #5
    def reset(self):
        self.tok2id = dict()
        self.id2tok = dict()

        self._add_special_tokens()
        self._set_special_token_ids()
        self.vocab = Counter()
Example #6
def get_product_count_dict_from_prokka(ffn_file: Path) -> dict:
    """ Reads Prokka fasta file and returns a dict of counts of each product name extracted from the headers """
    product_list = []
    with open(str(ffn_file)) as f:
        for line in f.readlines():
            if line.startswith(">"):
                line = line.strip()
                product = get_product_from_prokka_fasta_header(line)
                product_list.append(product)
    product_count_dict = dict(Counter(product_list))
    return product_count_dict
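
The dict(Counter(...)) step above simply materializes the tallies as a plain dict. A tiny sketch with hypothetical product names:

from collections import Counter

products = ["hypothetical protein", "recA", "hypothetical protein"]   # hypothetical parsed products
print(dict(Counter(products)))   # {'hypothetical protein': 2, 'recA': 1}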
Example #7
def get_seperate_list(raw_result):
    """
    Separate an n-dimensional list into n lists, grouping items by position.
    ex: get_seperate_list([[1,'b'],[2,'a']]) >>> [1,2],['b','a']
    :param raw_result: a list of equal-length rows
    :return: a generator yielding one list per column
    """

    # Number of columns (items per row) and number of rows.
    # len() is used directly; counting via Counter would miscount duplicates and
    # fails outright on unhashable rows such as lists.
    count = len(raw_result[0])
    total = len(raw_result)
    i = 0
    j = 0

    while j < count:
        new_list = []
        while i < total:
            new_list.append(raw_result[i][j])
            i += 1
        yield new_list
        i = 0
        j += 1
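
Assuming the definition above, the docstring example plays out as shown below; the built-in zip gives the same column grouping and is the idiomatic alternative:

rows = [[1, 'b'], [2, 'a']]                       # input from the docstring example
print(list(get_seperate_list(rows)))              # [[1, 2], ['b', 'a']]
print([list(col) for col in zip(*rows)])          # [[1, 2], ['b', 'a']]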
Example #8
    def load_from_vocab_file(self, file):
        """
        Load vocabulary from a .vocab file
        """

        self.tok2id = dict()
        self.id2tok = dict()

        self._add_special_tokens()
        self._set_special_token_ids()
        self.vocab = Counter()

        for line in open(file, encoding="utf-8").readlines():
            token, count = line.split("\t")
            self.vocab[token] = float(count)
            self.add_token(token)
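
The loop above expects a tab-separated .vocab file with one token and count per line; the trailing newline on the count is tolerated by float(). A standalone sketch with hypothetical lines:

from collections import Counter

vocab = Counter()
for line in ["the\t1523\n", "cat\t87\n"]:   # hypothetical .vocab file lines
    token, count = line.split("\t")
    vocab[token] = float(count)             # float("87\n") == 87.0
print(vocab["the"])                         # 1523.0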
Example #9
    def __init__(self,
                 pad="<pad>",
                 sos="<sos>",
                 eos="<eos>",
                 unk="<unk>",
                 oovs=0):
        self.PAD = pad
        self.SOS = sos
        self.EOS = eos
        self.UNK = unk
        self.oovs = oovs

        self.vocab = Counter()

        self.tok2id = dict()
        self.id2tok = dict()

        self.size = 0

        self.subword = None
Example #10
    def __init__(self,
                 pad="<pad>",
                 sos="<sos>",
                 eos="<eos>",
                 unk="<unk>",
                 oovs=0,
                 file=None,
                 preprocess=None,
                 subword=None,
                 lang=None):
        self.PAD = pad
        self.SOS = sos
        self.EOS = eos
        self.UNK = unk

        self.PAD_id = 0
        self.SOS_id = None
        self.EOS_id = None
        self.UNK_id = None

        self.lang = lang

        self.oovs = oovs

        self.vocab = Counter()

        self.tok2id = dict()
        self.id2tok = dict()
        self.freqs = dict()

        self.gpt2_tokenizer = None
        self.is_gpt2 = False

        self.subword = subword

        if file is not None:
            assert preprocess is not None, "Need preprocess() to build vocab!"
            self.build(file, preprocess)
Example #11
    def fit_vocab(
        self,
        samples,
        vocab_size=None,
        vocab_min_freq=0,
        embeddings_name=None,
        keep_rare_words_with_embeddings=False,
        add_embeddings_vocab=False,
    ):
        tokens = Counter()
        for sample in samples:
            # TODO: subtokenize?
            tokens.update(self.tokenize(sample))

        # We use our own Vocabulary class
        specials = list(
            OrderedDict.fromkeys(
                tok for tok in [self.unaligned_token] if tok is not None
            )
        )
        # TODO: handle embeddings/vectors
        self.vocab = Vocabulary(
            tokens,
            max_size=vocab_size,
            min_freq=vocab_min_freq,
            unk_token=self.unk_token,
            pad_token=self.pad_token,
            bos_token=self.bos_token,
            eos_token=self.eos_token,
            specials=specials,
            specials_first=self.specials_first,
            # TODO: missing vectors, etc.
            vectors=None,
            rare_with_vectors=keep_rare_words_with_embeddings,
            add_vectors_vocab=add_embeddings_vocab,
        )
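
The specials construction above uses OrderedDict.fromkeys as an order-preserving de-duplication of the special tokens. A standalone sketch with hypothetical tokens:

from collections import OrderedDict

toks = ["<unk>", "<pad>", "<unk>", None, "<unaligned>"]
specials = list(OrderedDict.fromkeys(t for t in toks if t is not None))
print(specials)   # ['<unk>', '<pad>', '<unaligned>']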
Example #12
def setup_bot(bot):
    bot.config = bot.loop.run_until_complete(get_config())
    bot.remove_command('help')
    discord_log = logging.getLogger('discord')
    log = logging.getLogger("main")
    discord_log.setLevel(logging.CRITICAL)
    bot.log, bot.logger = log, log
    log.info(f"{get_icon()}\nLoading....\n")
    bot.formatter = EmbededHelp()
    bot.debug = any('debug' in arg.lower() for arg in sys.argv)
    bot.uptime = datetime.datetime.utcnow()
    bot.commands_used = Counter()
    bot.messages_sent = 0
    bot.process = psutil.Process()
    bot.at_everyone_seen = 0
    bot.session = aiohttp.ClientSession(loop=bot.loop)
    bot.keys = bot.loop.run_until_complete(get_keys())
    bot.config = bot.loop.run_until_complete(get_config())
    bot.conn = bot.loop.run_until_complete(
        r.connect("localhost", db=bot.config['db'], port=28015))
    bot.loop.run_until_complete(module_loader(bot, "handlers"))
    bot.loop.run_until_complete(module_loader(bot, "commands"))
    if bot.debug:
        discord_log.setLevel(logging.INFO)
Example #13
    def populate_product_occurence_dictionary(self):
        """ Counts the number of times any product occurs in self.total_gene_list and stores in dict """
        self.product_occurence_dictionary = Counter(self.total_product_list)
Example #14
def get_member_counts(df: pd.DataFrame) -> dict:
    # Get a complete list of all cluster_rep values including duplicates, do total counts
    raw_cluster_rep_list = list(df['cluster_rep'])
    member_counts = Counter(raw_cluster_rep_list)
    return member_counts
def filter_core_cluster_tsv(cluster_tsv: Path, outdir: Path):
    """
    Filters/repairs the core cluster .tsv output file to ensure there are no cases where a single sample is represented
    in a cluster more than once. Previous versions of MMSeqs2 would sometimes encounter minor issues with clustering,
    and this method attempts to fix this. TODO: Verify this is still necessary

    :param cluster_tsv: path to cluster .tsv file
    :param outdir: path to the desired output directory
    :return: path to filtered/repaired cluster .tsv
    """

    # Read TSV
    df = cluster_tsv_to_df(cluster_tsv=cluster_tsv)

    # TODO: Refactor some of the logic below into clean functions
    # Get the cluster member dict
    cluster_member_dict = get_cluster_member_dict(df)

    # Determine which clusters contain duplicate samples, create a dictionary to store them
    to_repair_dict = {}
    for cluster_rep, members in cluster_member_dict.items():
        sample_id_list = []
        for m in members:
            sample_id = m.rsplit("_", 1)[0]
            sample_id_list.append(sample_id)
        counter_dict = Counter(sample_id_list)
        problematic_sample_id_dict = {
            x: counter_dict[x]
            for x in counter_dict if counter_dict[x] > 1
        }
        if len(problematic_sample_id_dict) >= 1:
            log.debug(
                f"Warning: Detected problematic samples: {problematic_sample_id_dict}"
            )
            to_repair_dict[cluster_rep] = problematic_sample_id_dict
        else:
            continue

    # Write a file containing all clusters/members flagged as problematic
    debug_dict = {}
    for cluster_rep, members in cluster_member_dict.items():
        sample_id_list = []
        for m in members:
            sample_id = m.rsplit("_", 1)[0]
            sample_id_list.append(sample_id)
        counter_dict = Counter(sample_id_list)
        problematic_sample_id_dict = {
            x: counter_dict[x]
            for x in counter_dict if counter_dict[x] > 1
        }
        if len(problematic_sample_id_dict) >= 1:
            debug_list = []
            for s in list(problematic_sample_id_dict.keys()):
                for m in members:
                    if s in m:
                        debug_list.append(m)
            debug_dict[cluster_rep] = debug_list

    debug_file = Path(outdir / 'master_genome_DB.cluster.debug.tsv')
    with open(str(debug_file), 'w') as f:
        for cluster_rep, members in debug_dict.items():
            for m in members:
                f.write(f"{cluster_rep}\t{m}\n")

    # Repair the summary tsv by removing rows until only 1 gene per sample is represented within a cluster
    # Determine index of rows to filter out
    row_indexes_to_remove = []
    for i, row in enumerate(df.itertuples()):
        cluster_rep = row.cluster_rep
        cluster_member = row.cluster_member
        if cluster_rep in to_repair_dict.keys():
            # Member sample_IDs that need to be reduced
            members_dict = to_repair_dict[cluster_rep]
            members_list = list(members_dict.keys())

            # Detect if this is a row we want to remove
            for m in members_list:
                if m in cluster_member:
                    # Check counter dict to see if there is > 1 member for cluster
                    if to_repair_dict[cluster_rep][m] > 1:
                        log.debug(f"Repairing {cluster_rep}")
                        row_indexes_to_remove.append(i)
                        # Decrement the counter dict
                        to_repair_dict[cluster_rep][m] -= 1
        else:
            continue

    # Create filtered version of df with duplicate rows removed
    filtered_df = df.drop(df.index[row_indexes_to_remove])

    # Export to TSV file
    outpath = outdir / cluster_tsv.with_suffix(".filtered.tsv").name
    filtered_df.to_csv(outpath, sep="\t", header=None, index=None)
    get_difference_between_cluster_tsvs(cluster_tsv=cluster_tsv,
                                        filtered_cluster_tsv=outpath)
    return outpath
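
The duplicate detection in the function above reduces to counting sample IDs within a cluster and keeping the ones seen more than once. A minimal sketch with hypothetical member names:

from collections import Counter

members = ["sampleA_001", "sampleA_002", "sampleB_001"]   # hypothetical cluster members
sample_ids = [m.rsplit("_", 1)[0] for m in members]       # drop the trailing gene index
counter_dict = Counter(sample_ids)
problematic = {s: n for s, n in counter_dict.items() if n > 1}
print(problematic)                                        # {'sampleA': 2}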