def calc_words_metadata(s):
    tokenizer = RegexpTokenizer(u'[а-яё]+')
    mystem_gr_tokenizer = RegexpTokenizer(mystem_gr_tokens)
    mystem_gr_vectorizer = CountVectorizer(
        tokenizer=mystem_gr_tokenizer.tokenize,
        vocabulary=mystem_gr_vocab,
        binary=True)
    token_coords = tokenizer.span_tokenize(s)
    raw_gr_descs = []
    word_indices = []
    for start, end in token_coords:
        tn = s[start:end]
        # cache the Mystem grammatical description per surface form
        if tn not in mystemmer_cache:
            d = mystemmer.analyze(tn)
            if (len(d) < 1 or 'analysis' not in d[0]
                    or len(d[0]['analysis']) < 1
                    or 'gr' not in d[0]['analysis'][0]):
                ext_gr_string = ''
            else:
                ext_gr_string = d[0]['analysis'][0]['gr']
            mystemmer_cache[tn] = ext_gr_string
        gr_string = mystemmer_cache[tn]
        if len(gr_string) == 0:
            continue
        # skip a token that runs to the very end of the string,
        # keeping word_indices and raw_gr_descs aligned
        if end < len(s):
            word_indices.append(end)
            raw_gr_descs.append(gr_string)
    # one binary grammeme vector per recorded word
    rows = mystem_gr_vectorizer.fit_transform(raw_gr_descs)
    rows = rows.toarray()
    result = {index: row for index, row in zip(word_indices, rows)}
    return result
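The function above relies on several module-level objects (mystemmer, mystemmer_cache, mystem_gr_tokens, mystem_gr_vocab) that are not shown in the snippet. A minimal sketch of how they might be set up, assuming pymystem3 and scikit-learn; the grammeme pattern and vocabulary below are illustrative placeholders, not the project's real values:

from nltk.tokenize import RegexpTokenizer
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer

# Mystem().analyze(word) returns entries like
# {'analysis': [{'gr': 'S,жен,од=им,ед', ...}], 'text': word}
mystemmer = Mystem()
mystemmer_cache = {}  # surface form -> grammatical description ('gr') string
mystem_gr_tokens = r'[^,=|()]+'  # hypothetical: split a 'gr' string into individual grammemes
mystem_gr_vocab = ['S', 'V', 'A', 'ед', 'мн', 'им']  # hypothetical subset of the grammeme vocabulary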
class ProcessorTokenizerNltkEn:
    """Performs tokenization of English texts.

    Wrapper around NLTK RegexpTokenizer.
    """

    def __init__(self, delay_init=False, *args, **kwargs):
        self._proc = None
        if not delay_init:
            self.init(*args, **kwargs)

    def init(self, abbrevs=_en_abbrevs):
        if self._proc is None:
            _en_regex = '|'.join(abbrevs + _ru_rules)
            self._proc = RegexpTokenizer(_en_regex)

    def __call__(self, text):
        """Performs tokenization of text.

        Args:
            text(str): raw text.

        Returns:
            List of Token objects.
        """
        return [
            Token(text[start:end], start, end)
            for (start, end) in self._proc.span_tokenize(text)
        ]
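This wrapper depends on the module's Token class and the _en_abbrevs / _ru_rules regex lists, which are not part of the excerpt. A self-contained sketch of the underlying technique, using a namedtuple in place of Token and a simplified pattern instead of _en_regex:

from collections import namedtuple
from nltk.tokenize import RegexpTokenizer

Token = namedtuple('Token', ['text', 'begin', 'end'])  # assumed shape of the Token class

text = "Mr. Brown arrived at 5 p.m. sharp."
tokenizer = RegexpTokenizer(r"[A-Za-z]+\.?|\d+|[^\w\s]")  # simplified stand-in for _en_regex
tokens = [Token(text[start:end], start, end)
          for start, end in tokenizer.span_tokenize(text)]
# span_tokenize yields (start, end) character offsets, so each Token keeps its position in the raw text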
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer


def get_tokens(text, spellcheck=False):
    # correct() (used when spellcheck=True) is assumed to be defined elsewhere
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()

    # Basic tokenization: keep the start offset of every lowercased word
    tokens = [(s, text[s:e].lower()) for s, e in tokenizer.span_tokenize(text)]
    if spellcheck:
        tokens = [(i, correct(word)) for i, word in tokens]
    tokens = [(i, stemmer.stem(word)) for i, word in tokens]

    # Stop words and bigrams
    en_stopwords = stopwords.words('english')
    prev_token = None
    new_tokens = []
    for i, word in tokens:
        if word in en_stopwords:
            prev_token = None
        else:
            new_tokens.append((i, word))
            if prev_token is not None:
                prev_i, prev_word = prev_token
                new_tokens.append((prev_i, prev_word + ' ' + word))
            prev_token = i, word
    tokens = new_tokens
    return tokens
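A hypothetical call, assuming the NLTK stopwords corpus has been downloaded; with spellcheck=True a correct() helper (not shown in the snippet) would also be required:

text = "Neural networks learn useful representations of text"
for offset, term in get_tokens(text):
    # unigrams are stemmed, lowercased words; bigrams join two adjacent
    # non-stopword stems and carry the character offset of the first one
    print(offset, term)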
def string_to_span(self, s):
    # creates a tokenized version and a span version of a string
    # helper to parse_xmls
    punctuation = "-,.?!:;"
    tokenizer = RegexpTokenizer(r"\s|:|;", gaps=True)
    tokenized = tokenizer.tokenize(s.lower())
    # remove punctuation if it is the last character of a word
    tokenized = [word.strip(punctuation) if word[-1] in punctuation else word
                 for word in tokenized]
    # span_tokenize gives the Pythonic span, i.e. (start, stop_but_not_including)
    span = list(tokenizer.span_tokenize(s))
    new_span = []
    for start, stop in span:
        # convert to the non-Pythonic span, i.e. (start, last_char)
        new_span.append((start, stop - 1))
    return new_span, tokenized
def highlight_phrases_from_list(t):
    # tokenize the text of the description, with spans
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9#+-]+')
    span_generator = tokenizer.span_tokenize(t)
    spans = [span for span in span_generator]
    tokens = [t[span[0]:span[1]] for span in spans]

    # create a dictionary of the words, with spans as the values,
    # and another dictionary with the same keys, with the word indexes as the values
    char_span = defaultdict(list)
    word_index = defaultdict(list)
    for i, (k, span) in enumerate(zip(tokens, spans)):
        char_span[k].append(span)
        word_index[k].append(i)

    # is this useful?
    df = pd.DataFrame({'Character Index Spans': pd.Series(char_span),
                       'Word Indexes': pd.Series(word_index)})

    # look up each skill phrase: match its first word, then check that the
    # following tokens match the rest of the phrase
    highlight_spans = []
    for skill_phrase in skill_phrase_wl:
        if word_index.get(skill_phrase[0]):
            for i, occurence in enumerate(word_index.get(skill_phrase[0])):
                if all(skill_phrase[j] == tokens[j + occurence]
                       for j in range(len(skill_phrase))):
                    highlight_span = (spans[occurence][0],
                                      spans[occurence + len(skill_phrase) - 1][1])
                    highlight_spans.append(highlight_span)

    # # look up the words in our skill list in the dictionary.
    # # List the findings as spans to be highlighted
    # for skill in single_word_skills:
    #     highlight_spans += char_span[skill]

    # Sort the spans to be highlighted
    highlight_spans.sort()

    # Insert HTML tags to highlight the keywords
    html_start_tag = '<font color="red">'
    html_end_tag = '</font>'
    highlighted = ''
    cursor = 0
    for span in highlight_spans:
        if span[0] > cursor:  # go forwards only, not backwards
            if cursor > 0:
                highlighted += html_end_tag
            highlighted += t[cursor:span[0]] + html_start_tag + t[span[0]:span[1]]
        elif span[1] > cursor:
            highlighted += t[cursor:span[1]]
        cursor = span[1]
    # close the last open tag (only if something was highlighted) and append the rest
    if cursor > 0:
        highlighted += html_end_tag
    highlighted += t[cursor:]

    display(HTML(highlighted))
class ProcessorTokenizerRu:
    """Performs tokenization of Russian texts with regexes.

    Wrapper around NLTK RegexpTokenizer. Supports Russian abbreviations.
    """

    def __init__(self, delay_init=False):
        self._proc = None
        if not delay_init:
            self.init()

    def init(self):
        if self._proc is None:
            self._proc = RegexpTokenizer(_ru_regex, flags=re.IGNORECASE)

    def __call__(self, text):
        assert self._proc
        return [
            Token(text[start:end], start, end)
            for (start, end) in self._proc.span_tokenize(text)
        ]
def tokenize_mail(self, mailtext):
    """
    Uses RegexpTokenizer to split the mail text on the pattern defined below.
    Each split is a separate mail. Returns a list of the mails contained in
    the given mailtext.
    """
    mails = []
    # splits the entire mail into parts matching the
    # 'On <Date Time> <*****@*****.**> wrote:' pattern
    tokenizer = RegexpTokenizer(
        r'\n[>|\s]*On[\s]* ([a-zA-Z0-9, :/<>@\.\"\[\]\r\n]*[\s]* wrote:)',
        gaps=True)
    mail_indices = tokenizer.span_tokenize(mailtext)
    # uses the splits' offset information from span_tokenize to slice the actual
    # mail content and stores each slice as an element of the 'mails' list
    start = end = 0
    for index in mail_indices:
        end = index[1] + 1
        mails.append(mailtext[start:end])
        start = end
    return mails  # list of the contained mails within a single mailtext
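A standalone sketch of the splitting idea behind this method: with gaps=True, span_tokenize returns the offsets of the text between the 'On ... wrote:' separators, and those offsets are used to slice the thread. The sample text and the simplified pattern below are illustrative, not the method's real inputs:

from nltk.tokenize import RegexpTokenizer

thread = "Thanks, that works.\nOn Mon, Jan 1 <a@b.com> wrote:\n> Did you try restarting?"
splitter = RegexpTokenizer(r'\nOn[^\n]* wrote:', gaps=True)
parts = [thread[start:end] for start, end in splitter.span_tokenize(thread)]
# parts[0] is the reply, parts[1] is the quoted original message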
class IndexFile(object):
    """Indexes a single document: stores each non-stopword token and its character offset."""

    def __init__(self, id, doc):
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.stopwords = set(stopwords.words('english'))
        self.stemmer = EnglishStemmer()
        self.doc = doc
        self.id = id

    def get_data(self):
        for start, end in self.tokenizer.span_tokenize(self.doc):
            token = self.doc[start:end].lower()
            if token in self.stopwords:
                continue
            words = Words(name=token)
            if word_exists(words.name):
                words = session.query(Words).filter_by(name=words.name).first()
            else:
                session.add(words)
                session.commit()
            index = Index(doc_id=self.id, word_id=words.id, beg=start)
            session.add(index)
            session.commit()
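The Words/Index ORM models, the session object, and word_exists() come from the surrounding project and are not shown. A minimal sketch of just the per-document preprocessing that get_data performs before the database writes (requires the NLTK stopwords corpus):

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

doc = "Indexing keeps the starting offset of every useful token."
tok = RegexpTokenizer(r'\w+')
stop = set(stopwords.words('english'))
postings = [(doc[start:end].lower(), start)
            for start, end in tok.span_tokenize(doc)
            if doc[start:end].lower() not in stop]
# each entry pairs a lowercased non-stopword token with its character offset, as get_data does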
def generate_feedback(self, methods: list, gazetteers: list):
    """
    Generates feedback according to the medical entities found in the essay
    with the specified methods and gazetteers.
    Feedback can be found at FEEDBACK_PATH.

    :param methods: list of methods used for annotation
    :param gazetteers: list of gazetteers that should be taken into account
    """
    # all feedback transmitted to CASUS is saved here
    feedbackList = []
    feedbackCount = 0

    # will be printed in the CSV feedback stats file, with the essayID in the 1st column;
    # then for each feedback there are 3 columns:
    #   0 / 1 (feedback given / not given)
    #   specified terms (main term or synonym associated with found term)
    #   found terms
    # specified and found terms of multiple conditions are separated with "&",
    # various found terms or specified synonyms with ";"
    feedbackStats = [self.essay_name]
    # count of the feedback (= line in feedback table)
    statscount = 0

    # save feedback to a file
    with open(self.file_path + "/" + self.essay_name.split(".")[0] +
              "_feedback.txt", "w") as feedback_f:
        # finds all strings with alphanumerical characters or with '-' or '_'
        # (the latter implicitly through \w)
        tokenizer = RegexpTokenizer(r'[\w-]+')
        token_offsets = list(tokenizer.span_tokenize(self.essay))

        # check for each feedback whether it fulfills the constraints (terms existing or missing)
        for (terms_main, terms_synonyms, feedback, feedbackType) in self.case_feedback[self.caseId]:
            fulfilled, useful_terms_main_advanced, useful_terms_synonyms_advanced = \
                self.check_conditions(terms_main, terms_synonyms, methods, gazetteers)
            #print("terms: " + str(useful_terms_main_advanced))
            #print("syn: " + str(useful_terms_synonyms_advanced))

            # if all conditions are fulfilled (e.g. MRT & NOT Roentgen are two conditions)
            if all(x == True for x in fulfilled):
                #print("fulfilled")
                # each fulfilled feedback will be transmitted to CASUS with the following information:
                # (feedbackID, set of offsets of found terms, set of associated strings, feedback text)
                # feedbackID is simply a count of all given feedbacks
                feedback_entry = (feedbackCount, [], [], feedback, feedbackType)
                # record that feedback was given in feedback stats
                feedbackStats.append("1")
                feedbackStats.append("")
                feedbackStats.append("")
                # some main terms are made of two different conditions, e.g. MRT & NOT Roentgen
                # (these are two options); a term made of various words is still one option,
                # e.g. Hepatitis Serologie
                for i, option in enumerate(terms_main):
                    #print("Option " + str(i) + ": " + ''.join(option))
                    # the last feedback often has "andere" (others) as the main term and all
                    # possible terms as synonyms; in this case, terms_main contains only the
                    # empty set, so the only option is empty
                    # if the option is a negated term (so the term was not found in the essay),
                    # record the negated term and all its synonyms that were not found in the
                    # essay in the feedback file
                    if len(option) != 0 and "NOT" in option[0]:
                        feedback_f.write("\nTRIGGER: None of the following terms detected\n")
                        feedback_f.writelines(
                            sorted([t.strip() + ", " for t in (option + terms_synonyms[i])]))
                        # record reason for feedback in feedback stats as the main term that was not found
                        if feedbackStats[statscount * 3 + 2] == "":
                            feedbackStats[statscount * 3 + 2] = ''.join(option)
                        # if this is not the first option
                        else:
                            feedbackStats[statscount * 3 + 2] += " & " + ''.join(option)
                    # if the option is a positive term (and the term or a synonym was found)
                    else:
                        feedback_f.write("\nTRIGGER: Detection of the following terms\n")
                        feedbackString = ""
                        # if the main term is detected, report only this, else report detected synonyms
                        l = useful_terms_main_advanced[i] if len(useful_terms_main_advanced[i]) != 0 \
                            else useful_terms_synonyms_advanced[i]
                        syn = False if len(useful_terms_main_advanced[i]) != 0 else True
                        # for each option, various terms may have been found (especially regarding synonyms)
                        for found, targetTerm in l:
                            same = False
                            # various words in the text may be associated with a targetTerm, they are listed in found
                            # e.g. ((8, 'körperlicher'), (9, 'untersuchung')) or, if only one word, ((14, 'labors'),)
                            foundlist = list(found)
                            tokenstring = ""
                            indexlist = []
                            # get the complete found string and associated word indices (indices of tokenized text)
                            for index, token in foundlist:
                                if tokenstring == "":
                                    tokenstring = token
                                else:
                                    tokenstring = tokenstring + " " + token
                                indexlist.append(index)
                            # get the start and end character offsets of the found words
                            startchars = token_offsets[indexlist[0]]
                            endchars = token_offsets[indexlist[-1]]
                            start = startchars[0]
                            end = endchars[1]
                            # add the offsets and the found term to the feedback to be transmitted to CASUS;
                            # various synonyms may be associated with the same word in the text, in which case
                            # don't add the term twice to the list of words that triggered the feedback
                            if tokenstring not in feedback_entry[2]:
                                feedback_entry[1].append((start, end))
                                feedback_entry[2].append(tokenstring)
                            # if the found term matches exactly the main term (or synonym),
                            # this is recorded with a different wording in the feedback file
                            if tokenstring.lower() == targetTerm.lower():
                                text_feedback = " wurde im Text gefunden an Position " + str(indexlist) + \
                                    " (" + str(start) + "-" + str(end) + ")"
                            else:
                                text_feedback = " " + str(indexlist) + " (" + str(start) + "-" + str(end) + \
                                    ") im Text wurde assoziiert mit " + targetTerm
                            # various found terms in the essay (e.g. of various synonyms) are separated by ";"
                            pre = "" if feedbackString == "" else "; "
                            feedbackString = feedbackString + pre + tokenstring + text_feedback
                            # option == [] means that the main term is "andere" and the "synonyms" are simply a
                            # list of terms such that if found the feedback is triggered (but we do not want to
                            # call them "synonyms" of andere)
                            if option != [] and syn:
                                feedbackString = feedbackString + " und ist ein Synonym von " + ''.join(option)
                            # record given term and found term for feedback stats
                            if feedbackStats[statscount * 3 + 2] == "":
                                feedbackStats[statscount * 3 + 2] = targetTerm
                                feedbackStats[statscount * 3 + 3] = tokenstring
                            elif pre == "":
                                feedbackStats[statscount * 3 + 2] += "; " + targetTerm
                                feedbackStats[statscount * 3 + 3] += "; " + tokenstring
                            else:
                                feedbackStats[statscount * 3 + 2] += " & " + targetTerm
                                feedbackStats[statscount * 3 + 3] += " & " + tokenstring
                        feedback_f.write(feedbackString)
                feedback_f.write("\n\nFEEDBACK:\n")
                feedback_f.write(feedback + "\n\n")
                feedback_f.write("_____________________________________________\n")
                feedbackList.append(feedback_entry)
                feedbackCount += 1
            # if the feedback is not given since the conditions are not satisfied,
            # this is only recorded in feedback stats - no feedback is recorded in the
            # feedback file or given to CASUS
            else:
                feedbackStats.append("0")
                feedbackStats.append("")
                feedbackStats.append("")
                # some main terms are made of two different conditions, e.g. MRT & NOT Roentgen
                # (these are two options); a term made of various words is still one option,
                # e.g. Hepatitis Serologie
                for i, option in enumerate(terms_main):
                    # if this is the "andere" row (thus none of the specified terms was found),
                    # record "andere" as main term (and nothing in the found column)
                    if len(option) == 0:
                        feedbackStats[statscount * 3 + 2] = "andere"
                    else:
                        neg = True if "NOT" in option[0] else False
                        # if no terms for this option were found in the essay, i.e.
                        # a) term is negative and satisfied (so not found in essay)
                        # b) term is positive and not satisfied (so not found in essay)
                        # record the main term (and nothing in the found column)
                        # NOTE: a satisfied option is possible if there are various options and
                        # one of the others is not satisfied
                        if (fulfilled[i] == True and neg == True) or (fulfilled[i] == False and neg == False):
                            if feedbackStats[statscount * 3 + 2] == "":
                                feedbackStats[statscount * 3 + 2] = ''.join(option)
                            else:
                                feedbackStats[statscount * 3 + 2] += " & " + ''.join(option)
                        # if terms for this option were found in the essay,
                        # record both the given term and the found version of it
                        else:
                            l = useful_terms_main_advanced[i] if len(useful_terms_main_advanced[i]) != 0 \
                                else useful_terms_synonyms_advanced[i]
                            for x, (found, targetTerm) in enumerate(l):
                                foundlist = list(found)
                                tokenstring = ""
                                for index, token in foundlist:
                                    if tokenstring == "":
                                        tokenstring = token
                                    else:
                                        tokenstring = tokenstring + " " + token
                                if feedbackStats[statscount * 3 + 2] == "":
                                    feedbackStats[statscount * 3 + 2] = targetTerm
                                    feedbackStats[statscount * 3 + 3] = tokenstring
                                # if this is not the first found term and the list isn't empty
                                elif x > 0:
                                    feedbackStats[statscount * 3 + 2] += "; " + targetTerm
                                    feedbackStats[statscount * 3 + 3] += "; " + tokenstring
                                # if this is the first found term and the list isn't empty,
                                # i.e. another option has been recorded before
                                else:
                                    feedbackStats[statscount * 3 + 2] += " & " + targetTerm
                                    feedbackStats[statscount * 3 + 3] += " & " + tokenstring
            statscount += 1
        #print(feedbackStats)

    # write the feedback stats to the stats CSV file
    with open(RESULTS_PATH + "stats_case" + self.caseId + ".csv") as inf, open(
            RESULTS_PATH + "stats_caseNew" + self.caseId + ".csv", 'w') as outf:
        reader = csv.reader(inf, delimiter=",")
        writer = csv.writer(outf, delimiter=",")
        # check if the essay name already exists, i.e. if a user submitted a solution
        # for this case before; if so, the old stats are replaced with the new ones
        found = False
        for line in reader:
            if line[0] == self.essay_name:
                writer.writerow(feedbackStats)
                found = True
            else:
                writer.writerow(line)
        if not found:
            writer.writerow(feedbackStats)
    os.remove(RESULTS_PATH + "stats_case" + self.caseId + ".csv")
    os.rename(RESULTS_PATH + "stats_caseNew" + self.caseId + ".csv",
              RESULTS_PATH + "stats_case" + self.caseId + ".csv")
    return feedbackList
def fillout_frames(self, filename_list):
    # reads in all the xml files and fills the two dataframes with the corresponding values
    # also creates mappings from tokens and ners to ids

    # initiate lists
    data_list = [["sentence_id", "token_id", "char_start_id", "char_end_id", "split"]]
    ner_list = [["sentence_id", "ner_id", "char_start_id", "char_end_id"]]

    # initiate word and ner mapping dictionaries
    self.id2word = {}
    self.id2ner = {}
    self.id2ner[0] = 'None'
    punct = "-,.?!:;"
    ner_id = 1
    word_id = 1

    # start reading in the files
    for filename in filename_list:
        # get split from pathname and create validation set
        if 'Test' in str(filename):
            split = 'test'
        else:
            # split train into train and validation
            split = random.choices(["train", "val"], weights=(75, 25), k=1)[0]

        # access xml data
        tree = ET.parse(filename)
        root = tree.getroot()
        for elem in root:
            # get sent_id
            sent_id = elem.get("id")
            # get tokens from sentence
            sentence = elem.get("text")
            sentence = sentence.replace(";", " ")
            sentence = sentence.replace("/", " ")
            tokenizer = RegexpTokenizer(r"\s|:|;", gaps=True)
            tokenized = tokenizer.tokenize(sentence)
            tokenized = [word.strip(punct) if word[-1] in punct else word
                         for word in tokenized]
            span = list(tokenizer.span_tokenize(sentence))
            char_ids = []
            for tpl in span:
                char_ids.append((tpl[0], tpl[1] - 1))

            # creating data_df_list, one sentence at a time
            for i, token in enumerate(tokenized):
                if token not in self.id2word.values():
                    self.id2word[word_id] = token
                    word_id += 1
                token_id = self.get_id(token, self.id2word)
                # one row in data_df
                word_tpl = (sent_id, token_id, int(char_ids[i][0]), int(char_ids[i][1]), split)
                data_list.append(word_tpl)

            for subelem in elem:
                if subelem.tag == "entity":
                    # get ner
                    ner = subelem.get("type")
                    # update ner id dict
                    if ner not in self.id2ner.values():
                        self.id2ner[ner_id] = ner
                        ner_id += 1
                    label = self.get_id(ner, self.id2ner)
                    # get char_start_id and char_end_id
                    if ";" not in subelem.get("charOffset"):
                        char_start, char_end = subelem.get("charOffset").split("-")
                        char_start, char_end = int(char_start), int(char_end)
                        # add row in ner_df for current entity
                        ner_list.append([sent_id, label, char_start, char_end])
                    # if more than one mention of an entity, split into several lines
                    else:
                        occurences = subelem.get("charOffset").split(";")
                        for occurence in occurences:
                            char_start, char_end = occurence.split("-")
                            char_start, char_end = int(char_start), int(char_end)
                            # add row in ner_df for current entity
                            ner_list.append([sent_id, label, char_start, char_end])

    self.data_df = pd.DataFrame(data_list[1:], columns=data_list[0])
    self.ner_df = pd.DataFrame(ner_list[1:], columns=ner_list[0])
parse_dict("ORG", "PER", glob.glob("resources/Collection5/*.ann"), lambda s : tokenize1(s)) parse_dict("Org", "Person", glob.glob("resources/testset/*.objects"), lambda s : tokenize2(s)) context = et.iterparse("resources/dict.opcorpora.xml", tag='lemma') for (_, element) in context: tag = Tag.NONE lemma = element[0] for g in lemma: if g.attrib['v'] in tag_map: tag = tag_map[g.attrib['v']] if tag != Tag.NONE: for form in element[1:]: root.add([form.attrib['t']], tag) with open("result.txt", "w") as result_file: with open("resources/dataset_40163_1.txt", "r") as dataset: for sentence in dataset: sentence = preprocess(sentence) tokens = [lemmatize(token) for token in tokenizer.tokenize(sentence)] positions = list(tokenizer.span_tokenize(sentence)) current_index = 0 while current_index < len(tokens): (tag, size) = root.get_first_match(tokens[current_index:]) #if tag == Tag.NONE: # (tag, size) = predict(tokens[current_index:]) for index in range(current_index, current_index + size): result_file.write(f"{positions[index][0]} {positions[index][1] - positions[index][0]} {tag.name} ") if tag == Tag.NONE: size = 1 current_index += size result_file.write("EOL\n")
# do_not_remove = [':', '-', '\\', '/']
# remove_punctuation = [p for p in punctuation if p not in do_not_remove]
# print remove_punctuation

data_dir = "../data/raw_harvard_tlink"
labels = []
for file in os.listdir(data_dir):
    if not file.endswith('.txt'):
        continue
    startIndices, endIndices = getStartEndIndices(file)
    f = open(os.path.join(data_dir, file), 'r')
    raw = f.read()

    tokenizer = RegexpTokenizer(patterns)
    span_generator = tokenizer.span_tokenize(raw)
    spans = [span for span in span_generator]
    # tokenizer = RegexpTokenizer(patterns)
    words = tokenizer.tokenize(raw)

    chunkStart = False
    offset = 0
    phraseBeginsAt = 0
    for i in range(len(words)):
        startIndex = spans[i][0] + 1
        endIndex = spans[i][1] + 1
        if str(startIndex) in startIndices and str(endIndex) in endIndices:
            label = "B-TIMEX3"
            chunkStart = False
        elif str(startIndex) in startIndices:
            label = "B-TIMEX3"
            chunkStart = True
# only by using finditer was I able to obtain the results without greedy search;
# the function returns an iterable, so we loop over it
for x in re.finditer(p, line):
    # x.group(0) contains the regex-matched expression
    wordList.append(x.group(0))

posTagList = nltk.pos_tag(wordList)
annotateIndexes = []
with open(fname + '.ann', 'r', encoding='utf8') as fp:
    for line in fp:
        wordArr = line.split("\t")
        indices = wordArr[1].split(" ")
        annotateIndexes.append(indices)

wordSpanIndex = list(re_tokenizer.span_tokenize(s))
index = 0
if not annotate:
    annotateList = posTagList
while index < len(posTagList) and annotate:
    x = posTagList[index]
    annotated = False
    totalNewlines = s[0:wordSpanIndex[index][0]].count('\n\n')
    spanStart = wordSpanIndex[index][0]  # + totalNewlines
    for annotateObj in annotateIndexes:
        tag = annotateObj[0]