def initialize(self):
        t1 = time()
        logger.info('Creating prebiotics catalog...')
        with open(self.tidy_csv_path) as file:
            prebiotics = file.read().splitlines()

        self.__hash_tree = HashTree(prebiotics)
        self.__prebiotics = prebiotics

        t2 = time()
        logger.info('Done creating prebiotics catalog. Total time: %.2f sec.' % (t2 - t1))
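
    # Usage sketch (hypothetical class name and attribute wiring -- adapt to the
    # real catalog); the tidy file is assumed to hold one entity name per line:
    #
    #     catalog = PrebioticsCatalog()
    #     catalog.tidy_csv_path = 'data/prebiotics_tidy.csv'
    #     catalog.initialize()
    #     collection = catalog.find('Inulin and oligofructose are prebiotics.')
    #     print(collection.entities)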
    def initialize(self):
        t1 = time()
        logger.info('Creating mixed food catalog...')
        with open(self.mixed_tidy_food_file_path) as file:
            food = file.read().splitlines()

        self.__hash_tree = HashTree(food)
        self.__food = food

        t2 = time()
        logger.info('Done creating mixed food catalog. Total time: %.2f sec.' % (t2 - t1))
    def initialize(self):
        t1 = time()
        logger.info('Creating food catalog...')
        self.__food_data_frame = pd.read_table(self.food_file_path, sep=',')
        self.__food_dict = {food_group: [] for food_group in self.__food_data_frame['group'].values}
        for index, record in self.__food_data_frame.iterrows():
            self.__food_dict[record['group']].append(record['name'].strip())

        self.__group_by_food_name = {food: group
                                     for group, food_list in self.__food_dict.items()
                                     for food in food_list}
        self.__hash_tree = HashTree(self.__group_by_food_name.keys())

        t2 = time()
        logger.info('Done creating food catalog. Total time: %.2f sec.' % (t2 - t1))
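
    # A tiny illustration (toy data, not the real catalog) of the inversion built
    # above: a group -> foods dict flipped into a food -> group lookup with a
    # nested comprehension.
    #
    #     food_dict = {'fruit': ['apple', 'pear'], 'grain': ['oat']}
    #     group_by_name = {food: group
    #                      for group, foods in food_dict.items()
    #                      for food in foods}
    #     assert group_by_name == {'apple': 'fruit', 'pear': 'fruit', 'oat': 'grain'}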
    def __init__(self, catalog_list, sentence_parser, sentence_analyzer, tags_to_search, tags_optional_to_search,
                 tags_to_exclude=None):
        super().__init__()
        if tags_to_exclude is None:
            tags_to_exclude = []
        self.tags_to_exclude = tags_to_exclude
        self.catalog_list = catalog_list
        self.tags_optional = set(tags_optional_to_search)
        self.tags = set(tags_to_search)
        self.sentence_analyzer = sentence_analyzer
        self.sentence_parser = sentence_parser
        # 'en' is the legacy spaCy model shorthand; newer spaCy versions require
        # the full model name, e.g. spacy.load('en_core_web_sm')
        self.nlp = spacy.load('en')

        logger.info("search will start with catalogs: %s\nrequired tags: %s\noptional tags: %s\nexcluded tags: %s"
                    % (catalog_list, tags_to_search, tags_optional_to_search, tags_to_exclude))
    def initialize(self):
        """Creation of catalog object
        input:
            :param verbose:
        creates:
            self.__scientific_names: dictionary with NCBI_id as key and scientific bacteria name as value
            self.__bact_id_dict: dictionary with various versions of bacterial names as keys and NCBI_id as value
            self.hash_tree_root: root node of hash tree
        """
        t1 = time()
        logger.info('Creating all bacterial catalog...')

        names = pd.read_table(self.all_bact_path, sep=',')
        names_scientific = self.sci_names(table_names=names)
        # set_index(...).T.to_dict('records')[0] flattens a two-column frame into
        # a plain dict (see the toy example after this method)
        self.__scientific_names = names_scientific.set_index('id').T.to_dict('records')[0]
        self.__bact_id_dict = names[['name', 'id']].set_index('name').T.to_dict('records')[0]
        self.__hash_tree = HashTree(self.__bact_id_dict.keys())

        t2 = time()
        logger.info('Done creating bacterial catalog. Total time: %.2f sec.' % (t2 - t1))
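
# A minimal, runnable illustration (toy data) of the set_index(...).T.to_dict('records')[0]
# idiom used in initialize() above: it flattens a two-column frame into a plain dict.
import pandas as pd

_toy = pd.DataFrame({'id': [562, 590], 'name': ['Escherichia coli', 'Salmonella']})
_as_dict = _toy.set_index('id').T.to_dict('records')[0]
assert _as_dict == {562: 'Escherichia coli', 590: 'Salmonella'}
# an equivalent and arguably clearer spelling of the same transformation:
assert _toy.set_index('id')['name'].to_dict() == _as_dict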
def parse_analyze(parser, stanford_tokenizer, text, names):
    start = time.time()
    # todo: make more obvious (use a dict?)
    bacteria_names, nutrient_names, diseases_names, food_names = names
    try:
        parser_output = parser.parse_sentence(text)
        analyze_output = analyze_sentence(bacterial_names=bacteria_names, nutrient_names=nutrient_names,
                                          disease_names=diseases_names, food_names=food_names,
                                          parser_output=parser_output, tokenizer=stanford_tokenizer)
    except Exception as error:
        logger.error(error)
        end = time.time()
        logger.info("===\nparsed/analyzed error: %s, \ntime: %f" % (text, end - start))
        return [None, None]

    end = time.time()
    logger.info("===\nparsed/analyzed: %s, \ntime: %f" % (text, end - start))
    return [parser_output, analyze_output]
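
# Call sketch (hypothetical example values): `names` is a 4-element list of name
# lists in the fixed order [bacteria, nutrients, diseases, food], e.g.
#
#     parse_analyze(parser, stanford_tokenizer,
#                   'Escherichia coli ferments lactose.',
#                   [['Escherichia coli'], ['lactose'], [], []])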
    def get_sentence(self, sentence_text, article):
        if not self.check_if_title(article.title):
            return None

        if len(sentence_text) > SENTENCE_LENGTH_THRESHOLD:
            return None

        entities_collections = []
        for catalog in self.catalog_list:
            found_entities = catalog.find(sentence_text)
            entities_collections.append(found_entities)

        tags_in_sentence = {collection.tag for collection in entities_collections if collection.entities}

        if not self.check_if_tags(tags_in_sentence):
            return None

        tokens = self.nlp(sentence_text)
        tokens_words = [token.orth_ for token in tokens]

        # todo: check -- if a bacterium is found in both gut_catalog and all_bacteria_catalog, which one should we keep?
        logger.info("entities before remove overlapping: %s" % str(entities_collections))
        entities_collections = remove_entity_overlapping(entities_collections, tokens_words)
        logger.info("entities after remove overlapping: %s" % str(entities_collections))

        # join multi-word entity names with underscores (_), e.g. 'Bacteroides fragilis' -> 'Bacteroides_fragilis'
        for collection in entities_collections:
            for entity in collection.entities:
                dashed_name = entity.name.replace(' ', '_')
                sentence_text = sentence_text.replace(entity.name, dashed_name)
                entity.name = dashed_name

        # remove excluded entities (collected into a separate list first so that
        # collection.entities is not mutated while we iterate over it)
        for collection in entities_collections:
            bad_entities = [x for x in collection.entities
                            if x.tag in self.tags_to_exclude or
                            any(y in self.tags_to_exclude for y in x.additional_tags)]
            for entity in bad_entities:
                collection.entities.remove(entity)

        entities_collections = [collection for collection in entities_collections if collection.entities]
        tags_in_sentence = {collection.tag for collection in entities_collections}

        logger.info("entities after excluding: %s" % str(entities_collections))
        if not self.check_if_tags(tags_in_sentence):
            return None

        tokens = self.nlp(sentence_text)

        # entities list for parser
        all_entities_list = []
        for collection in entities_collections:
            all_entities_list.extend(collection.entities)

        parser_output = self.sentence_parser.parse_sentence(sentence_text, all_entities_list, tokens)

        paths = self.sentence_analyzer.analyze_sentence(parser_output, tags_in_sentence)

        sentence = Sentence(text=sentence_text,
                            article=article,
                            entities_collections=entities_collections,
                            parser_output=parser_output,
                            shortest_paths=paths)

        return sentence
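
    # Usage sketch (hypothetical objects; __init__ above shows how the searcher
    # is constructed):
    #
    #     sentence = searcher.get_sentence('Bifidobacterium longum degrades inulin.', article)
    #     if sentence is not None:
    #         print(sentence.shortest_paths)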
    sentences_data = pd.read_csv(file, sep='\t')
    # ast.literal_eval(...) parses the stringified python lists stored in each column;
    # .tolist()[0] turns the one-element pandas Series into its single str element
    sentences_dictionary = {text: [ast.literal_eval(group["bacteria"].tolist()[0]),
                                   ast.literal_eval(group["nutrients"].tolist()[0]),
                                   ast.literal_eval(group["diseases"].tolist()[0]),
                                   ast.literal_eval(group["food"].tolist()[0])]
                            for text, group in sentences_data.groupby("text")}
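
    # Shape sketch (assumed TSV layout): a row whose list-valued columns hold
    # stringified python lists, e.g.
    #     text: 'E. coli ferments lactose.'
    #     bacteria: "['Escherichia coli']"   nutrients: "['lactose']"
    #     diseases: '[]'                     food: '[]'
    # becomes the entry
    #     {'E. coli ferments lactose.': [['Escherichia coli'], ['lactose'], [], []]}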

    if not os.path.exists("out"):
        os.mkdir("out")

    stanford_tokenizer = StanfordTokenizer(path_to_jar=stanford_jar_path)
    parser = SpacySentenceParser()

    logger.info("start parse sentences")
    names_dictionary = {}
    logger.info("total number of sentences before filter: %i" % len(sentences_dictionary))
    # 1. prepare names
    for key, value in sentences_dictionary.items():
        # key is sentence text
        # value[0] is bacteria
        # value[1] is nutrient
        # value[2] is disease
        # value[3] is food
        # todo: make more obvious (use a dict?)

        # filter: keep only a few NCBI taxonomy ids
        # 562 is Escherichia coli
        # 1496 is Clostridium difficile
        # 590 is Salmonella
    def initialize(self):
        logger.info("created do-nothing catalog")