def initialize(self):
    t1 = time()
    logger.info('Creating prebiotics catalog...')
    with open(self.tidy_csv_path) as file:
        prebiotics = file.read().splitlines()
    self.__hash_tree = HashTree(prebiotics)
    self.__prebiotics = prebiotics
    t2 = time()
    logger.info('Done creating prebiotics catalog. Total time: %.2f sec.' % (t2 - t1))
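# The HashTree class is this project's phrase-lookup structure; its internals are
# not shown in this section. A minimal sketch of the idea it supports -- index
# multi-word names by their first word, then scan a sentence word by word -- might
# look like the following (hypothetical PhraseTree/find API, not the project's class):
class PhraseTree:
    def __init__(self, phrases):
        # map first word -> set of full phrases starting with that word
        self._by_first_word = {}
        for phrase in phrases:
            first = phrase.split()[0].lower()
            self._by_first_word.setdefault(first, set()).add(phrase.lower())

    def find(self, sentence):
        words = sentence.lower().split()
        found = []
        for i, word in enumerate(words):
            for phrase in self._by_first_word.get(word, ()):
                length = len(phrase.split())
                if ' '.join(words[i:i + length]) == phrase:
                    found.append(phrase)
        return found

tree = PhraseTree(['inulin', 'resistant starch'])
print(tree.find('diets rich in resistant starch and inulin were studied'))
# -> ['resistant starch', 'inulin']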
def initialize(self):
    t1 = time()
    logger.info('Creating mixed food catalog...')
    with open(self.mixed_tidy_food_file_path) as file:
        food = file.read().splitlines()
    self.__hash_tree = HashTree(food)
    self.__food = food
    t2 = time()
    logger.info('Done creating mixed food catalog. Total time: %.2f sec.' % (t2 - t1))
def initialize(self):
    t1 = time()
    logger.info('Creating food catalog...')
    self.__food_data_frame = pd.read_table(self.food_file_path, sep=',')
    # group -> list of food names in that group
    self.__food_dict = {food_group: [] for food_group in self.__food_data_frame['group'].values}
    for index, record in self.__food_data_frame.iterrows():
        self.__food_dict[record['group']].append(record['name'].strip())
    # inverted index: food name -> its group
    self.__group_by_food_name = {food: group
                                 for group, food_list in self.__food_dict.items()
                                 for food in food_list}
    self.__hash_tree = HashTree(self.__group_by_food_name.keys())
    t2 = time()
    logger.info('Done creating food catalog. Total time: %.2f sec.' % (t2 - t1))
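# Toy illustration of the two mappings built above, using the same column names;
# the sample rows are made up for demonstration.
import pandas as pd

df = pd.DataFrame({'group': ['fruit', 'fruit', 'grain'],
                   'name': ['apple ', 'pear', 'oat']})
food_dict = {g: [] for g in df['group'].values}
for _, record in df.iterrows():
    food_dict[record['group']].append(record['name'].strip())
group_by_food_name = {food: group
                      for group, food_list in food_dict.items()
                      for food in food_list}
print(food_dict)           # {'fruit': ['apple', 'pear'], 'grain': ['oat']}
print(group_by_food_name)  # {'apple': 'fruit', 'pear': 'fruit', 'oat': 'grain'}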
def __init__(self, catalog_list, sentence_parser, sentence_analyzer, tags_to_search,
             tags_optional_to_search, tags_to_exclude=None):
    super().__init__()
    if tags_to_exclude is None:
        tags_to_exclude = []
    self.tags_to_exclude = tags_to_exclude
    self.catalog_list = catalog_list
    self.tags_optional = set(tags_optional_to_search)
    self.tags = set(tags_to_search)
    self.sentence_analyzer = sentence_analyzer
    self.sentence_parser = sentence_parser
    self.nlp = spacy.load('en')
    logger.info(
        "search will start with catalogs: %s\nand tags req: %s\ntags opt: %s\ntags excl: %s"
        % (catalog_list, tags_to_search, tags_optional_to_search, tags_to_exclude))
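# Hypothetical wiring based only on the constructor signature above; the class
# name (Searcher), the catalog/parser/analyzer variables, and the tag strings
# are assumptions for illustration, not taken from the project.
searcher = Searcher(
    catalog_list=[bacteria_catalog, food_catalog, disease_catalog],
    sentence_parser=sentence_parser,
    sentence_analyzer=sentence_analyzer,
    tags_to_search=['bacteria'],                  # required tags
    tags_optional_to_search=['food', 'disease'],  # optional tags
    tags_to_exclude=['prebiotic'])                # entities with these tags are dropped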
def initialize(self): """Creation of catalog object input: :param verbose: creates: self.__scientific_names: dictionary with NCBI_id as key and scientific bacteria name as value self.__bact_id_dict: dictionary with various versions of bacterial names as keys and NCBI_id as value self.hash_tree_root: root node of hash tree """ t1 = time() logger.info('Creating all bacterial catalog...') names = pd.read_table(self.all_bact_path, sep=',') names_scientific = self.sci_names(table_names=names) self.__scientific_names = names_scientific.set_index('id').T.to_dict('records')[0] self.__bact_id_dict = names[['name', 'id']].set_index('name').T.to_dict('records')[0] self.__hash_tree = HashTree(self.__bact_id_dict.keys()) t2 = time() logger.info('Done creating bacterial catalog. Total time: %.2f sec.' % (t2 - t1))
def parse_analyze(parser, stanford_tokenizer, text, names):
    start = time.time()
    # todo: make more obvious (use a dict?)
    bacteria_names = names[0]
    nutrient_names = names[1]
    diseases_names = names[2]
    food_names = names[3]
    try:
        parser_output = parser.parse_sentence(text)
        analyze_output = analyze_sentence(bacterial_names=bacteria_names,
                                          nutrient_names=nutrient_names,
                                          disease_names=diseases_names,
                                          food_names=food_names,
                                          parser_output=parser_output,
                                          tokenizer=stanford_tokenizer)
    except Exception as error:
        logger.error(error)
        end = time.time()
        logger.info("===\nparsed/analyzed error: %s, \ntime: %f" % (text, end - start))
        return [None, None]
    end = time.time()
    logger.info("===\nparsed/analyzed: %s, \ntime: %f" % (text, end - start))
    return [parser_output, analyze_output]
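# The todo above asks for something more explicit than positional indexing.
# One possible refactor (a sketch, not the project's code) keys the name lists
# by category, so names[2]-style lookups disappear; sample values are made up.
names = {
    'bacteria': ['Escherichia coli'],
    'nutrients': ['inulin'],
    'diseases': ['colitis'],
    'food': ['yogurt'],
}
bacteria_names = names['bacteria']   # instead of names[0]
diseases_names = names['diseases']   # instead of names[2]
print(bacteria_names, diseases_names)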
def get_sentence(self, sentence_text, article):
    if not self.check_if_title(article.title):
        return None
    if len(sentence_text) > SENTENCE_LENGTH_THRESHOLD:
        return None
    entities_collections = []
    for catalog in self.catalog_list:
        found_entities = catalog.find(sentence_text)
        entities_collections.append(found_entities)
    tags_in_sentence = set(collection.tag for collection in entities_collections
                           if len(collection.entities) > 0)
    if not self.check_if_tags(tags_in_sentence):
        return None
    tokens = self.nlp(sentence_text)
    tokens_words = [token.orth_ for token in tokens]
    # todo: check - if we find bacteria both in gut_catalog and all_bacteria_catalog, which one do we keep?
    logger.info("entities before removing overlapping: %s" % str(entities_collections))
    entities_collections = remove_entity_overlapping(entities_collections, tokens_words)
    logger.info("entities after removing overlapping: %s" % str(entities_collections))
    # join all multi-word names with an underscore (_)
    for collection in entities_collections:
        for entity in collection.entities:
            dashed_name = entity.name.replace(' ', '_')
            sentence_text = sentence_text.replace(entity.name, dashed_name)
            entity.name = dashed_name
    # remove entities carrying excluded tags
    for collection in entities_collections:
        bad_entities = [x for x in collection.entities
                        if any(y in self.tags_to_exclude for y in x.additional_tags)
                        or x.tag in self.tags_to_exclude]
        for entity in bad_entities:
            collection.entities.remove(entity)
    entities_collections = [collection for collection in entities_collections
                            if len(collection.entities) > 0]
    tags_in_sentence = set(collection.tag for collection in entities_collections
                           if len(collection.entities) > 0)
    logger.info("entities after excluding: %s" % str(entities_collections))
    if not self.check_if_tags(tags_in_sentence):
        return None
    tokens = self.nlp(sentence_text)
    # flatten entity collections into a single list for the parser
    all_entities_list = []
    for collection in entities_collections:
        all_entities_list.extend(collection.entities)
    parser_output = self.sentence_parser.parse_sentence(sentence_text, all_entities_list, tokens)
    paths = self.sentence_analyzer.analyze_sentence(parser_output, tags_in_sentence)
    sentence = Sentence(text=sentence_text, article=article,
                        entities_collections=entities_collections,
                        parser_output=parser_output, shortest_paths=paths)
    return sentence
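# remove_entity_overlapping is not shown in this section. A common resolution
# strategy -- only an assumption about what it does, not the project's actual
# implementation -- is to keep the longest entity when two token spans overlap:
def keep_longest(spans):
    """spans: list of (start_token, end_token, name); drop spans overlapping a longer one."""
    spans = sorted(spans, key=lambda s: s[1] - s[0], reverse=True)
    kept = []
    for start, end, name in spans:
        # keep only if this span is disjoint from every span already kept
        if all(end <= kept_start or start >= kept_end for kept_start, kept_end, _ in kept):
            kept.append((start, end, name))
    return kept

print(keep_longest([(2, 4, 'Clostridium difficile'), (2, 3, 'Clostridium')]))
# -> [(2, 4, 'Clostridium difficile')]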
sentences_data = pd.read_csv(file, sep='\t')
# ast.literal_eval(...) - parse the string representation of a Python list
# .tolist()[0] - transform the one-element pandas Series into a list and take its single str element
sentences_dictionary = {item: [ast.literal_eval(group["bacteria"].tolist()[0]),
                               ast.literal_eval(group["nutrients"].tolist()[0]),
                               ast.literal_eval(group["diseases"].tolist()[0]),
                               ast.literal_eval(group["food"].tolist()[0])]
                        for item, group in sentences_data.groupby("text")}
if not os.path.exists("out"):
    os.mkdir("out")
stanford_tokenizer = StanfordTokenizer(path_to_jar=stanford_jar_path)
parser = SpacySentenceParser()
logger.info("start parsing sentences")
names_dictionary = {}
logger.info("total number of sentences before filter: %i" % len(sentences_dictionary))
# 1. prepare names
for key, value in sentences_dictionary.items():
    # key is the sentence text
    # value[0] is bacteria, value[1] is nutrients, value[2] is diseases, value[3] is food
    # todo: make more obvious (use a dict?)
    # filter by NCBI id:
    # 562 is E. coli
    # 1496 is Clostridium difficile
    # 590 is Salmonella
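# Why ast.literal_eval: each TSV cell stores the string repr of a Python list,
# and literal_eval safely parses it back into a list (unlike eval, it accepts
# only Python literals and never executes code). The cell value below is made up.
import ast

cell = "['Escherichia coli', 'Salmonella']"
print(ast.literal_eval(cell))  # ['Escherichia coli', 'Salmonella']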
def initialize(self): logger.info("created do nothing catalog")