def init_y_annotations(self):
    """Initialise the joint y vector from manually annotated abstracts.

    For each annotated study, the first word tagged as a number ('n') is
    parsed (optionally prefixed "N=" / "n=") and stored in
    self.data["y_lookup_init"], keyed by the biview study id.

    Raises:
        TypeError: if a tagged number cannot be parsed as an integer.
    """
    logging.info("Identifying seed data from annotated data")
    # NOTE(review): p is created but never tap()-ped in this routine, and it
    # is sized by self.biviewer while the loop iterates annotation_viewer —
    # confirm whether progress reporting was intended here.
    p = progressbar.ProgressBar(len(self.biviewer), timer=True)
    annotation_viewer = LabeledAbstractReader()
    counter = 0  # number of seed abstracts found
    for study in annotation_viewer:
        study_id = int(study["Biview_id"])
        text = swap_num(annotation_viewer.get_biview_id(study_id)['abstract'])
        parsed_tags = tag_words(text, flatten=True)
        # keep only words carrying the number tag 'n'
        tagged_number = [w[0] for w in parsed_tags if 'n' in w[1]]
        if tagged_number:
            number = re.match(r"[Nn]?=?([1-9]+[0-9]*)", tagged_number[0])
            if number:
                self.data["y_lookup_init"][study_id] = int(number.group(1))
                counter += 1
            else:
                # bug fix: the message was passed logging-style
                # (format string + separate arg) to TypeError, which never
                # interpolates — format it explicitly
                raise TypeError(
                    'Unable to convert tagged number %s to integer'
                    % tagged_number[0])
    self.seed_abstracts = counter
    logging.info("%d seed abstracts found", counter)
def __init__(self, text):
    """Tokenise *text* into per-sentence word-feature dicts and load templates."""
    self.text = text
    numeric_text = swap_num(text)
    sentence_features = []
    for sentence in self.sent_tokenize(numeric_text):
        tokens = self.word_tokenize(sentence)
        sentence_features.append([{"w": token} for token in tokens])
    self.functions = sentence_features
    self.load_templates()
def init_y_annotations(self):
    """Initialise the joint y vector from manually annotated abstracts.

    For each annotated study, the first word tagged as a number ('n') is
    parsed (optionally prefixed "N=" / "n=") and stored in
    self.data["y_lookup_init"], keyed by the biview study id.

    Raises:
        TypeError: if a tagged number cannot be parsed as an integer.
    """
    logging.info("Identifying seed data from annotated data")
    # NOTE(review): p is created but never tap()-ped in this routine, and it
    # is sized by self.biviewer while the loop iterates annotation_viewer —
    # confirm whether progress reporting was intended here.
    p = progressbar.ProgressBar(len(self.biviewer), timer=True)
    annotation_viewer = LabeledAbstractReader()
    counter = 0  # number of seed abstracts found
    for study in annotation_viewer:
        study_id = int(study["Biview_id"])
        text = swap_num(annotation_viewer.get_biview_id(study_id)['abstract'])
        parsed_tags = tag_words(text, flatten=True)
        # keep only words carrying the number tag 'n'
        tagged_number = [w[0] for w in parsed_tags if 'n' in w[1]]
        if tagged_number:
            number = re.match(r"[Nn]?=?([1-9]+[0-9]*)", tagged_number[0])
            if number:
                self.data["y_lookup_init"][study_id] = int(number.group(1))
                counter += 1
            else:
                # bug fix: the message was passed logging-style
                # (format string + separate arg) to TypeError, which never
                # interpolates — format it explicitly
                raise TypeError(
                    'Unable to convert tagged number %s to integer'
                    % tagged_number[0])
    self.seed_abstracts = counter
    logging.info("%d seed abstracts found", counter)
def __init__(self, text):
    """POS-tag each sentence of *text* and build word/POS feature dicts."""
    numeric_text = swap_num(text)
    features = []
    for sentence in self.sent_tokenize(numeric_text):
        tagged = pos_tagger.tag(self.word_tokenize(sentence))
        features.append([{"w": token, "p": tag} for token, tag in tagged])
    self.functions = features
    self.load_templates()
    self.text = text
def __init__(self, text, window_size):
    """Build word/POS features from *text* with an n-gram window of *window_size*.

    Thousands-separated number pairs (e.g. "1,000") are removed before
    number-word substitution and tokenisation.
    """
    # bug fix: the original assigned the comma-stripped text to self.text,
    # then tokenised the *raw* text and finally overwrote self.text with the
    # raw text again, leaving the substitution dead. Strip commas on the
    # text that is actually tokenised, as the later revisions of this
    # class do before swap_num()/tag_words().
    cleaned_text = re.sub(r'(?:[0-9]+)\,(?:[0-9]+)', '', text)
    self.functions = [
        [{"w": word, "p": pos}
         for word, pos in pos_tagger.tag(self.word_tokenize(sent))]
        for sent in self.sent_tokenize(swap_num(cleaned_text))]
    self.load_templates()
    self.w_pos_window = window_size
    self.text = text
def seed_y_regex(self, annotation_viewer):
    """Initialise the joint y vector and build test-set answers.

    First records, for every study in *annotation_viewer*, the manually
    tagged number (or -2 when no number is tagged) in self.answers keyed
    by biview id. Then seeds self.data["y_lookup_init"] from PubMed
    abstracts containing exactly one high-precision
    "N ... were randomised" statement.

    Args:
        annotation_viewer: sequence of annotated studies supporting
            indexing, len(), and .get(i) returning per-sentence tag lists.
    """
    self.initialise()
    self.annotation_viewer_to_biviewer = {}
    self.answers = {}
    self.annotation_viewer = annotation_viewer
    logging.info("Generating answers for test set")
    p = progressbar.ProgressBar(len(self.annotation_viewer), timer=True)
    for study in range(len(self.annotation_viewer)):
        p.tap()
        biview_id = annotation_viewer[study]["biview_id"]
        self.annotation_viewer_to_biviewer[study] = biview_id
        # flatten the per-sentence tag lists, then pull number-tagged words
        parsed_tags = [item for sublist in annotation_viewer.get(study)
                       for item in sublist]
        tagged_numbers = [w[0] for w in parsed_tags if 'n' in w[1]]
        if tagged_numbers:
            number = int(tagged_numbers[0])
        else:
            number = -2  # sentinel: no number annotated for this study
        self.answers[biview_id] = number
    logging.info("Generating seed data from regular expression")
    p = progressbar.ProgressBar(len(self.biviewer), timer=True)
    counter = 0  # number of studies initially found
    for study_id, (cochrane_dict, pubmed_dict) in enumerate(self.biviewer):
        p.tap()
        pubmed_text = pubmed_dict.get("abstract", "")
        # simple rule to identify population sizes
        # (low sensitivity/recall, high specificity/precision)
        pubmed_text = swap_num(pubmed_text)
        # raw string: '\w' is an invalid escape in a plain literal
        matches = re.findall(
            r'([1-9][0-9]*) (?:\w+ )*(?:participants|men|women|patients|children|people) were (?:randomi[sz]ed)',
            pubmed_text)
        # require exactly one match to avoid ambiguous abstracts
        if len(matches) == 1:
            self.data["y_lookup_init"][study_id] = int(matches[0])
            counter += 1
    self.seed_abstracts = counter
    logging.info("%d seed abstracts found", counter)
def get_annotations(self, abstract, convert_numbers=True):
    """Return word tags for *abstract*.

    Args:
        abstract: raw abstract text.
        convert_numbers: when True, numerical strings (e.g. "twenty-five")
            are converted to digits ("25") and thousands-separated numbers
            (e.g. "1,000") are removed before tagging.

    Returns:
        The tags produced by tag_words().
    """
    if convert_numbers:
        abstract = swap_num(abstract)
        # raw string: '\,' is an invalid escape in a plain literal
        abstract = re.sub(r'(?:[0-9]+)\,(?:[0-9]+)', '', abstract)
    tags = tag_words(abstract)
    return tags
def __init__(self, text, window_size):
    """Build tagged-sentence features from raw text or pre-tagged sentences.

    Args:
        text: either a raw abstract string, or an already-tagged list of
            sentences (as produced by tag_words).
        window_size: word/POS n-gram window size.
    """
    if isinstance(text, str):
        # remove thousands-separated numbers (e.g. "1,000"), then convert
        # number words to digits before tagging
        self.text = re.sub(r'(?:[0-9]+)\,(?:[0-9]+)', '', text)
        # bug fix: swap_num was applied to the *raw* text, silently
        # discarding the comma-stripping substitution just assigned above
        self.text = swap_num(self.text)
        self.tag_tuple_sents = tag_words(self.text)
    elif isinstance(text, list):
        self.tag_tuple_sents = text
    self.functions = self.set_functions(self.tag_tuple_sents)
    self.w_pos_window = window_size
    self.load_templates()
def __init__(self, text, window_size):
    """Build tagged-sentence features from raw text or pre-tagged sentences.

    Args:
        text: either a raw abstract string, or an already-tagged list of
            sentences (as produced by tag_words).
        window_size: word/POS n-gram window size.
    """
    if isinstance(text, str):
        # remove thousands-separated numbers (e.g. "1,000"), then convert
        # number words to digits before tagging
        self.text = re.sub(r"(?:[0-9]+)\,(?:[0-9]+)", "", text)
        # bug fix: swap_num was applied to the *raw* text, silently
        # discarding the comma-stripping substitution just assigned above
        self.text = swap_num(self.text)
        self.tag_tuple_sents = tag_words(self.text)
    elif isinstance(text, list):
        self.tag_tuple_sents = text
    self.functions = self.set_functions(self.tag_tuple_sents)
    self.w_pos_window = window_size
    self.load_templates()
def __init__(self, text_dict, window_size):
    """Build word/POS/section features from a Cochrane part -> text mapping.

    Args:
        text_dict: mapping of Cochrane section name to its text.
        window_size: word/POS n-gram window size.
    """
    self.functions = []
    # bug fix: dict.iteritems() is Python 2 only; .items() works on
    # both Python 2 and 3
    for part_name, part_text in text_dict.items():
        self.functions.extend(
            [[{"w": word, "p": pos, "cochrane_part": part_name}
              for word, pos in pos_tagger.tag(self.word_tokenize(sent))]
             for sent in self.sent_tokenize(swap_num(part_text))])
    self.load_templates()
    self.w_pos_window = window_size
def get_annotations(abstract_nr, annotator, convert_numbers=False):
    """Fetch abstract *abstract_nr* for *annotator* and return its word tags.

    When convert_numbers is True, numerical strings (e.g. "twenty-five")
    are converted to digits ("25") before tagging.
    """
    text = get_abstracts(annotator)[abstract_nr]
    if convert_numbers:
        text = swap_num(text)
    return tag_words(text)
def init_y_regex(self):
    """Initialise the joint y vector using a high-precision seed regex.

    Scans each PubMed abstract for a single, unambiguous statement of the
    number randomised (e.g. "120 patients were randomised"); abstracts
    with exactly one match seed self.data["y_lookup_init"].
    """
    logging.info("Identifying seed data from regular expression")
    p = progressbar.ProgressBar(len(self.biviewer), timer=True)
    counter = 0  # number of studies initially found
    for study_id, (cochrane_dict, pubmed_dict) in enumerate(self.biviewer):
        p.tap()
        pubmed_text = pubmed_dict.get("abstract", "")
        # simple rule to identify population sizes
        # (low sensitivity/recall, high specificity/precision)
        pubmed_text = swap_num(pubmed_text)
        # raw string: '\w' is an invalid escape in a plain literal
        matches = re.findall(
            r'([1-9][0-9]*) (?:\w+ )*(?:participants|men|women|patients) were (?:randomi[sz]ed)',
            pubmed_text)
        # require exactly one match to avoid ambiguous abstracts
        if len(matches) == 1:
            self.data["y_lookup_init"][study_id] = int(matches[0])
            counter += 1
    self.seed_abstracts = counter
    logging.info("%d seed abstracts found", counter)
def init_y_regex(self):
    """Initialise the joint y vector using a high-precision seed regex.

    Scans each PubMed abstract for a single, unambiguous statement of the
    number randomised (e.g. "120 patients were randomised"); abstracts
    with exactly one match seed self.data["y_lookup_init"].
    """
    logging.info("Identifying seed data from regular expression")
    p = progressbar.ProgressBar(len(self.biviewer), timer=True)
    counter = 0  # number of studies initially found
    for study_id, (cochrane_dict, pubmed_dict) in enumerate(self.biviewer):
        p.tap()
        pubmed_text = pubmed_dict.get("abstract", "")
        # simple rule to identify population sizes
        # (low sensitivity/recall, high specificity/precision)
        pubmed_text = swap_num(pubmed_text)
        # raw string: '\w' is an invalid escape in a plain literal
        matches = re.findall(
            r'([1-9][0-9]*) (?:\w+ )*(?:participants|men|women|patients) were (?:randomi[sz]ed)',
            pubmed_text)
        # require exactly one match to avoid ambiguous abstracts
        if len(matches) == 1:
            self.data["y_lookup_init"][study_id] = int(matches[0])
            counter += 1
    self.seed_abstracts = counter
    logging.info("%d seed abstracts found", counter)
def __init__(self, text, window_size):
    """Build word/POS features from *text* with an n-gram window of *window_size*.

    Thousands-separated number pairs (e.g. "1,000") are removed before
    number-word substitution and tokenisation.
    """
    # bug fix: the original assigned the comma-stripped text to self.text,
    # then tokenised the *raw* text and finally overwrote self.text with the
    # raw text again, leaving the substitution dead. Strip commas on the
    # text that is actually tokenised, as the later revisions of this
    # class do before swap_num()/tag_words().
    cleaned_text = re.sub(r'(?:[0-9]+)\,(?:[0-9]+)', '', text)
    self.functions = [
        [{"w": word, "p": pos}
         for word, pos in pos_tagger.tag(self.word_tokenize(sent))]
        for sent in self.sent_tokenize(swap_num(cleaned_text))]
    self.load_templates()
    self.w_pos_window = window_size
    self.text = text
def __init__(self, text_dict, window_size):
    """Build word/POS/section features from a Cochrane part -> text mapping.

    Args:
        text_dict: mapping of Cochrane section name to its text.
        window_size: word/POS n-gram window size.
    """
    self.functions = []
    # bug fix: dict.iteritems() is Python 2 only; .items() works on
    # both Python 2 and 3
    for part_name, part_text in text_dict.items():
        self.functions.extend(
            [[{"w": word, "p": pos, "cochrane_part": part_name}
              for word, pos in pos_tagger.tag(self.word_tokenize(sent))]
             for sent in self.sent_tokenize(swap_num(part_text))])
    self.load_templates()
    self.w_pos_window = window_size
def __init__(self, text):
    """POS-tag each sentence of *text* and build word/POS feature dicts."""
    converted = swap_num(text)
    per_sentence = []
    for sentence in self.sent_tokenize(converted):
        pairs = pos_tagger.tag(self.word_tokenize(sentence))
        per_sentence.append([{"w": w, "p": p} for w, p in pairs])
    self.functions = per_sentence
    self.load_templates()
    self.text = text
def __init__(self, text):
    """Tokenise *text* into per-sentence word-feature dicts and load templates."""
    self.text = text
    converted = swap_num(text)
    per_sentence = []
    for sentence in self.sent_tokenize(converted):
        per_sentence.append(
            [{"w": token} for token in self.word_tokenize(sentence)])
    self.functions = per_sentence
    self.load_templates()