class MixedFoodCatalog(Catalog):
    """Catalog of food names read from a text file holding one name per line."""

    def __init__(self, mixed_tidy_food_file_path):
        super().__init__()
        self.mixed_tidy_food_file_path = mixed_tidy_food_file_path
        # Both are populated by initialize(); until then they stay None.
        self.__hash_tree = None
        self.__food = None

    def __str__(self):
        return "mixed food catalog"

    def get_list(self):
        """Return the food names loaded by ``initialize`` (``None`` before that)."""
        return self.__food

    def initialize(self):
        """Read the food file, build the search tree and log the elapsed time."""
        started = time()
        logger.info('Creating mixed food catalog...')
        with open(self.mixed_tidy_food_file_path) as source:
            names = source.read().splitlines()

        self.__hash_tree = HashTree(names)
        self.__food = names

        finished = time()
        logger.info('Done creating mixed food catalog. Total time: %.2f sec.' % (finished - started))

    def find(self, sentence_text):
        """Search ``sentence_text`` for catalog names and wrap the hits as food entities.

        Every hit is reported with the placeholder group ``'nogroup'``.
        """
        matches = self.__hash_tree.search(sentence_text)
        found = [Entity(match, 'nogroup', FOOD_TAG) for match in matches]
        return EntityCollection(found, FOOD_TAG)
class PrebioticsCatalog(Catalog):
    """Catalog of prebiotic names read from a text file holding one name per line."""

    def get_list(self):
        """Return the prebiotic names loaded by ``initialize`` (``None`` before that).

        The names were already being stored on the instance but never exposed;
        returning them mirrors ``MixedFoodCatalog.get_list``.
        """
        return self.__prebiotics

    def __init__(self, tidy_csv_path):
        super().__init__()
        self.tidy_csv_path = tidy_csv_path
        # Both are populated by initialize(); until then they stay None.
        self.__hash_tree = None
        self.__prebiotics = None

    def initialize(self):
        """Read the prebiotics file, build the search tree and log the elapsed time."""
        t1 = time()
        logger.info('Creating prebiotics catalog...')
        with open(self.tidy_csv_path) as file:
            prebiotics = file.read().splitlines()

        self.__hash_tree = HashTree(prebiotics)
        self.__prebiotics = prebiotics

        t2 = time()
        logger.info('Done creating prebiotics catalog. Total time: %.2f sec.' % (t2 - t1))

    def find(self, sentence_text):
        """Search ``sentence_text`` for catalog names and wrap the hits as prebiotic entities.

        Every hit is reported with the placeholder identifier ``'noid'``.
        """
        prebiotic_names = self.__hash_tree.search(sentence_text)
        return EntityCollection(
            [Entity(name, 'noid', PREBIOTIC_TAG) for name in prebiotic_names],
            PREBIOTIC_TAG,
        )
class UsdaFoodCatalog(Catalog):
    """Food catalog backed by a comma-separated table with ``group`` and ``name`` columns.

    Each food name found in a sentence is reported together with the food
    group it belongs to.
    """

    def get_list(self):
        """Return the distinct food groups followed by every food name in the table.

        Requires ``initialize`` to have been called first.
        """
        frame = self.__food_data_frame
        return list(frame['group'].drop_duplicates()) + list(frame['name'])

    def __str__(self):
        return "usda food catalog"

    def __init__(self, food_file_path):
        super().__init__()
        self.food_file_path = food_file_path
        # Everything below is populated by initialize().
        self.__hash_tree = None
        self.__food_dict = None
        self.__group_by_food_name = None
        self.__food_data_frame = None

    # NOTE: this class previously carried several extra ``initialize``
    # definitions copied from other catalogs. The last definition in a class
    # body wins, so they replaced the method below and made initialization
    # fail on attributes this class never sets. Only the USDA version is kept.
    def initialize(self):
        """Load the food table, index food names by group and build the search tree."""
        t1 = time()
        logger.info('Creating food catalog...')
        self.__food_data_frame = pd.read_table(self.food_file_path, sep=',')

        # group -> stripped food names, groups kept in order of first appearance.
        food_dict = {}
        for group, name in zip(self.__food_data_frame['group'], self.__food_data_frame['name']):
            food_dict.setdefault(group, []).append(name.strip())
        self.__food_dict = food_dict

        # Inverse mapping; a name listed under several groups keeps the group
        # that appears last in ``food_dict``.
        self.__group_by_food_name = {food: group
                                     for group, food_list in food_dict.items()
                                     for food in food_list}
        self.__hash_tree = HashTree(self.__group_by_food_name.keys())

        t2 = time()
        logger.info('Done creating food catalog. Total time: %.2f sec.' % (t2 - t1))

    def find(self, sentence_text):
        """Search ``sentence_text`` for food names and wrap the hits as food entities.

        Each entity carries the food group recorded for the matched name.
        """
        food_names = self.__hash_tree.search(sentence_text)
        return EntityCollection(
            [Entity(name, self.__group_by_food_name[name], FOOD_TAG) for name in food_names],
            FOOD_TAG,
        )
class NutrientsCatalogNikogosov(Catalog):
    """Object holding nutrient ontology.

    Backed by a tab-separated table with ``idname`` and ``name`` columns; each
    ``idname`` may have several ``name`` rows.
    """

    def __init__(self, path):
        # The food and prebiotics catalogs also chain to the base initializer.
        super().__init__()
        self.path = path
        # Everything below is populated by initialize().
        self.__nutrients_by_idname = None
        self.__idname_by_nutrient = None
        self.__hash_tree = None

    # NOTE: a second ``initialize`` copied from the bacteria catalog used to
    # follow ``get_list`` in this class. Being defined last it replaced this
    # method and failed on attributes this class never sets; it was removed.
    def initialize(self):
        """Load the nutrient table, index names by ``idname`` and build the search tree."""
        t1 = time()
        constants.logger.info('Creating nutrients catalog...')

        data = pd.read_csv(self.path, sep="\t")

        # idname -> names, idnames kept in order of first appearance.
        nutrients_by_idname = {}
        for idname, name in zip(data['idname'], data['name']):
            nutrients_by_idname.setdefault(idname, []).append(name)
        self.__nutrients_by_idname = nutrients_by_idname

        # Inverse mapping used to label every match with its idname.
        self.__idname_by_nutrient = {name: idname
                                     for idname, name_list in nutrients_by_idname.items()
                                     for name in name_list}
        self.__hash_tree = HashTree(self.__idname_by_nutrient.keys())

        t2 = time()
        constants.logger.info('Done creating nutrients catalog. Total time: %.2f sec.' % (t2 - t1))

    def find(self, sentence_text):
        """Use the previously generated hash tree to search a sentence for nutrient names.

        input:
            sentence_text: sentence to search for nutrient names

        returns:
            EntityCollection with one nutrient Entity per match, each carrying
            the idname recorded for the matched name
        """
        nutr_names = self.__hash_tree.search(sentence_text)
        # The tag is passed to the collection as well, as every other catalog
        # in this file does.
        return EntityCollection(
            [Entity(nutrient, self.__idname_by_nutrient[nutrient], NUTRIENT_TAG)
             for nutrient in nutr_names],
            NUTRIENT_TAG,
        )

    def get_list(self):
        """Return the first recorded name of every idname."""
        return [names[0] for names in self.__nutrients_by_idname.values()]
class DiseasesCatalog(Catalog):
    """Catalog of disease names backed by a tab-separated table with ``id`` and ``name`` columns."""

    def get_list(self):
        """Not implemented for this catalog; always returns ``None``."""
        return None

    def __str__(self):
        return "diseases catalog"

    def __init__(self, diseases_csv_path):
        # The food and prebiotics catalogs also chain to the base initializer.
        super().__init__()
        self.disease_dictionary = {}  # disease name -> id, filled by initialize()
        self.hash_tree = None
        self.diseases_csv_path = diseases_csv_path

    def initialize(self):
        """Load the diseases table, map names to ids and build the search tree."""
        t1 = time()
        constants.logger.info('Creating diseases catalog...')

        data = pd.read_csv(self.diseases_csv_path, sep="\t")
        data = data[['id', 'name']]
        # Later rows overwrite earlier ones when a name is repeated.
        for row in data.to_dict("records"):
            self.disease_dictionary[row['name']] = row['id']
        self.hash_tree = HashTree(self.disease_dictionary.keys())

        t2 = time()
        constants.logger.info('Done creating diseases catalog. Total time: %.2f sec.' % (t2 - t1))

    def find(self, sentence_text):
        """Use the previously generated hash tree to search a sentence for disease names.

        Apostrophes (both ``'`` and ``’``) are removed from the sentence
        before searching.

        input:
            sentence_text: sentence to search for disease names

        returns:
            EntityCollection with one disease Entity per match, each carrying
            the id recorded for the matched name
        """
        sentence_text = re.sub('[’\']', '', sentence_text)
        diseases_names = self.hash_tree.search(sentence_text)
        return EntityCollection(
            [Entity(name, self.disease_dictionary[name], DISEASE_TAG) for name in diseases_names],
            DISEASE_TAG,
        )
class AllBacteriaCatalog(Catalog):
    """Object holding NCBI ontology.

    Backed by a comma-separated table with ``name``, ``id`` and ``class``
    columns.
    """

    def __init__(self, all_bact_path):
        self.all_bact_path = all_bact_path
        # Everything below is populated by initialize().
        self.__scientific_names = None
        self.__bact_id_dict = None
        self.__hash_tree = None

    def __str__(self):
        return "all bacteria catalog"

    def get_list(self):
        """Not implemented for this catalog; always returns ``None``."""
        return None

    def initialize(self):
        """Load the names table and build the lookup structures.

        creates:
            self.__scientific_names: dictionary with id as key and scientific name as value
            self.__bact_id_dict: dictionary with every listed name as key and id as value
            self.__hash_tree: hash tree built over the keys of self.__bact_id_dict
        """
        started = time()
        logger.info('Creating all bacterial catalog...')

        table = pd.read_table(self.all_bact_path, sep=',')
        scientific = self.sci_names(table_names=table)

        # Transposing a one-column frame gives a single row whose "record"
        # is the index -> value dictionary.
        scientific_records = scientific.set_index('id').T.to_dict('records')
        self.__scientific_names = scientific_records[0]

        id_records = table[['name', 'id']].set_index('name').T.to_dict('records')
        self.__bact_id_dict = id_records[0]

        self.__hash_tree = HashTree(self.__bact_id_dict.keys())

        finished = time()
        logger.info('Done creating bacterial catalog. Total time: %.2f sec.' % (finished - started))

    def find(self, sentence_text):
        """Use the previously generated hash tree to search a sentence for bacterial names.

        input:
            sentence_text: sentence to search for bacterial names

        returns:
            EntityCollection with one bacteria Entity per match, carrying the
            matched name, its id and the extra tag ALL_BACTERIA_TAG
        """
        found = self.__hash_tree.search(sentence_text)
        codes = [self.__bact_id_dict[hit] for hit in found]
        pairs = list(zip(found, codes))

        collected = [Entity(hit, code, BACTERIA_TAG, [ALL_BACTERIA_TAG])
                     for hit, code in pairs]
        return EntityCollection(collected, BACTERIA_TAG)

    def get_scientific_name(self, ncbi_id):
        """Return the scientific name stored for ``ncbi_id``."""
        return self.__scientific_names[ncbi_id]

    def sci_names(self, table_names):
        """Return the ``name``/``id`` rows whose class is 'scientific name'.

        Rows with a missing id are dropped and only the first row per id is kept.
        """
        is_scientific = table_names['class'] == 'scientific name'
        has_id = ~table_names['id'].isnull()
        selected = table_names.loc[is_scientific & has_id, ['name', 'id']]
        return selected.drop_duplicates(subset=['id'])