def read_conll_2_column_data(path_to_conll_file: str, tag_name: str):
    sentences: List[Sentence] = []
    lines: List[str] = open(path_to_conll_file).read().strip().split('\n')

    sentence: Sentence = Sentence()
    for line in lines:
        if line == '':
            # an empty line ends the current sentence
            if len(sentence) > 0:
                sentences.append(sentence)
            sentence = Sentence()
        else:
            fields: List[str] = re.split(r"\s+", line)
            token = Token(fields[0])
            token.add_tag(tag_name, fields[1])
            sentence.add_token(token)

    if len(sentence.tokens) > 0:
        sentences.append(sentence)

    return sentences
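# Usage sketch (the file path and tag name are hypothetical): read a
# two-column "token<whitespace>tag" file and inspect the first sentence.
sentences = read_conll_2_column_data('data/train_pos.txt', 'pos')
print(len(sentences), sentences[0].to_tagged_string())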
def read_germeval(path_to_conll_file: str, tag_scheme='iob') -> List[Sentence]:
    sentences: List[Sentence] = []
    lines: List[str] = open(path_to_conll_file).read().strip().split('\n')

    sentence: Sentence = Sentence()
    for line in lines:
        if line.startswith('#'):
            continue
        elif line == '':
            if len(sentence.tokens) > 0:
                sentence.convert_tag_scheme(target_scheme=tag_scheme)
                sentences.append(sentence)
            sentence = Sentence()
        else:
            fields: List[str] = re.split(r"\s+", line)
            token = Token(fields[1])
            token.add_tag('ner', fields[2])
            sentence.add_token(token)

    if len(sentence.tokens) > 0:
        sentence.convert_tag_scheme(target_scheme=tag_scheme)
        sentences.append(sentence)

    return sentences
def _convert_to_flair(self, data, labels=None):
    """
    Convert data and labels into a list of flair.data.Sentence objects.

    Parameters
    ----------
    data : list(list(str))
        List of lists of tokens: each inner list holds the tokens (words) of
        one sentence, and the outer list holds the sentences.
    labels : list(list(str)), can be None
        List of lists of NER tags corresponding to the tokens in data.

    Returns
    -------
    sentences : list(flair.data.Sentence)
    """
    sentences = []
    if labels is None:
        # with no labels, reuse data as a same-shaped placeholder so zip works
        labels = data
        use_dummy_labels = True
    else:
        use_dummy_labels = False
    for tokens, tags in zip(data, labels):
        sentence = Sentence()
        for token, tag in zip(tokens, tags):
            t = Token(token)
            if not use_dummy_labels:
                t.add_tag("ner", tag)
            sentence.add_token(t)
        sentences.append(sentence)
    return sentences
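# Usage sketch: `wrapper` stands in for an instance of the (unnamed) class
# that defines _convert_to_flair; the data and labels are made-up examples.
data = [["John", "lives", "in", "Berlin"]]
labels = [["B-PER", "O", "O", "B-LOC"]]
sentences = wrapper._convert_to_flair(data, labels)
print(sentences[0].to_tagged_string())  # John <B-PER> lives in Berlin <B-LOC>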
def process_conll_doc(input_file_name, output_file_name):
    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name, "r") as input_file, \
            open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0
        for line in input_file:
            if "DOCSTART" in line:
                if doc is None:
                    doc = Sentence()
                else:
                    docs.append(doc)
                    doc = Sentence()
                spos = 0
            else:
                lsplit = line.split("\t")
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0 and c < len(lsplit):
                        token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1
                doc.add_token(token)
        # append the final document as well (otherwise it would be dropped)
        if doc is not None:
            docs.append(doc)

        for d in docs:
            myjson = {
                "text": unidecode.unidecode(d.to_tokenized_string()),
                "spans": []
            }
            res = requests.post(NEURAL_EL_SERVER, json=myjson)
            info = res.json()
            for i in info:
                entity_ran = range(i[0], i[0] + i[1])
                for t in d.tokens:
                    if t.start_position in entity_ran:
                        t.add_tag("pnme", i[2])
            for t in d:
                output_file.write(
                    t.text + "\t" +
                    t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")
def tag_it(token: Token, index, ner_spans):
    labels = [(start, end, label)
              for start, end, label in ner_spans
              if start <= index <= end]
    if len(labels) > 0:
        for start, end, label in labels:
            token.add_tag(TAG_TYPE, prefix_to_BIOES(label, start, end, index))
    else:
        token.add_tag(TAG_TYPE, 'O')
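# tag_it relies on TAG_TYPE and prefix_to_BIOES, which are defined elsewhere.
# The helper below is only a plausible sketch of such a function, not the
# original: it picks the BIOES prefix from where `index` falls in the span.
TAG_TYPE = 'ner'  # assumed value

def prefix_to_BIOES(label, start, end, index):
    if start == end:
        return 'S-' + label  # single-token span
    if index == start:
        return 'B-' + label  # span begins here
    if index == end:
        return 'E-' + label  # span ends here
    return 'I-' + label      # strictly inside the span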
def test_sentence_to_tagged_string():
    token1 = Token('I', 0)
    token2 = Token('love', 1, 0)
    token3 = Token('Berlin', 2, 1)
    token3.add_tag('ner', 'LOC')

    sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)

    assert 'I love Berlin <LOC>' == sentence.to_tagged_string()
def read_column_data(path_to_column_file: Path,
                     column_name_map: Dict[int, str],
                     infer_whitespace_after: bool = True):
    """
    Reads a file in column format and produces a list of Sentence objects with
    token-level annotation as specified in the column_name_map. For instance,
    by passing "{0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}" as column_name_map
    you specify that the first column is the text (lexical value) of the token,
    the second the PoS tag, the third the chunk and the fourth the NER tag.
    :param path_to_column_file: the path to the column file
    :param column_name_map: a map of column number to token annotation name
    :param infer_whitespace_after: if True, tries to infer the whitespace_after field for each Token
    :return: list of sentences
    """
    sentences: List[Sentence] = []
    try:
        lines: List[str] = open(str(path_to_column_file), encoding='utf-8').read().strip().split('\n')
    except UnicodeDecodeError:
        log.info('UTF-8 can\'t read: {} ... using "latin-1" instead.'.format(path_to_column_file))
        lines: List[str] = open(str(path_to_column_file), encoding='latin1').read().strip().split('\n')

    # most data sets have the token text in the first column; if not, pass 'text' as column
    text_column: int = 0
    for column in column_name_map:
        if column_name_map[column] == 'text':
            text_column = column

    sentence: Sentence = Sentence()
    for line in lines:
        if line.startswith('#'):
            continue

        if line.strip() == '':
            if len(sentence) > 0:
                sentence.infer_space_after()
                sentences.append(sentence)
            sentence = Sentence()
        else:
            fields: List[str] = re.split(r"\s+", line)
            token = Token(fields[text_column])
            for column in column_name_map:
                if len(fields) > column and column != text_column:
                    token.add_tag(column_name_map[column], fields[column])
            sentence.add_token(token)

    if len(sentence.tokens) > 0:
        sentence.infer_space_after()
        sentences.append(sentence)

    return sentences
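# Usage sketch (the data path is hypothetical): read a CoNLL-2003-style
# four-column file, mapping each column to a tag type.
column_name_map = {0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}
sentences = read_column_data(Path('data/conll03/train.txt'), column_name_map)
print(len(sentences), sentences[0].to_tagged_string())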
def read_group_file(path_to_file, entities):
    sentences: List[Sentence] = []
    for line in open(path_to_file):
        sentence: Sentence = Sentence()
        labels_data, text = line.rstrip().split('\t')
        labels, tokens = data_to_bio(labels_data, text, entities)
        for label, token in zip(labels, tokens):
            token = Token(token)
            token.add_tag('ner', label)
            sentence.add_token(token)
        sentences.append(sentence)
    return sentences
def test_sentence_to_tagged_string():
    token1 = Token("I", 0)
    token2 = Token("love", 1, 0)
    token3 = Token("Berlin", 2, 1)
    token3.add_tag("ner", "LOC")

    sentence: Sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)

    assert "I love Berlin <LOC>" == sentence.to_tagged_string()
def read_column_data(path_to_column_file: str, column_name_map: Dict[int, str]):
    """
    Reads a file in column format and produces a list of Sentence objects with
    token-level annotation as specified in the column_name_map. For instance,
    by passing "{0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}" as column_name_map
    you specify that the first column is the text (lexical value) of the token,
    the second the PoS tag, the third the chunk and the fourth the NER tag.
    :param path_to_column_file: the path to the column file
    :param column_name_map: a map of column number to token annotation name
    :return: list of sentences
    """
    sentences: List[Sentence] = []
    lines: List[str] = open(path_to_column_file).read().strip().split('\n')

    # most data sets have the token text in the first column; if not, pass 'text' as column
    text_column: int = 0
    for column in column_name_map:
        if column_name_map[column] == 'text':
            text_column = column

    sentence: Sentence = Sentence()
    for line in lines:
        if line.startswith('#'):
            continue

        if line == '':
            if len(sentence) > 0:
                sentences.append(sentence)
            sentence = Sentence()
        else:
            fields: List[str] = re.split(r"\s+", line)
            token = Token(fields[text_column])
            for column in column_name_map:
                if len(fields) > column and column != text_column:
                    token.add_tag(column_name_map[column], fields[column])
            sentence.add_token(token)

    if len(sentence.tokens) > 0:
        sentences.append(sentence)

    return sentences
def standoff_to_flair_sents(
        docs: List[Document],
        tokenizer: Tokenizer,
        verbose=False) -> Tuple[List[Sentence], List[ParsedDoc]]:
    sents, parsed_docs = standoff_to_sents(docs=docs, tokenizer=tokenizer,
                                           verbose=verbose)

    flair_sents = []
    for sent in sents:
        flair_sent = Sentence()
        for token in sent:
            tok = Token(token.text)
            tok.add_tag(tag_type='ner', tag_value=token.label)
            flair_sent.add_token(tok)
        flair_sents.append(flair_sent)

    return flair_sents, parsed_docs
def __getitem__(self, index: int = 0) -> Sentence:
    if self.in_memory:
        sentence = self.sentences[index]
    else:
        with open(str(self.path_to_column_file), encoding=self.encoding) as file:
            file.seek(self.indices[index])
            line = file.readline()
            sentence: Sentence = Sentence()
            while line:
                if self.comment_symbol is not None and line.startswith(self.comment_symbol):
                    line = file.readline()
                    continue

                if self.__line_completes_sentence(line):
                    if len(sentence) > 0:
                        sentence.infer_space_after()
                        if self.tag_to_bioes is not None:
                            sentence.convert_tag_scheme(
                                tag_type=self.tag_to_bioes, target_scheme="iobes")
                        return sentence
                else:
                    fields: List[str] = re.split("[\t\n]", line)
                    token = Token(fields[self.text_column])
                    for column in self.column_name_map:
                        if len(fields) > column and column != self.text_column:
                            token.add_tag(self.column_name_map[column], fields[column])
                    if not line.isspace():
                        sentence.add_token(token)

                line = file.readline()

    return sentence
def read_conll_sequence_labeling_data(path_to_conll_file: str):
    sentences: List[Sentence] = []
    lines: List[str] = open(path_to_conll_file).read().strip().split('\n')

    sentence: Sentence = Sentence()
    for line in lines:
        if line == '':
            if len(sentence) > 0:
                sentences.append(sentence)
            sentence = Sentence()
        else:
            fields: List[str] = re.split(r"\s+", line)
            token = Token(fields[0])
            token.add_tag('pos', fields[1])
            token.add_tag('np', fields[2])
            if len(fields) > 3:
                token.add_tag('ner', fields[3])
            sentence.add_token(token)

    if len(sentence.tokens) > 0:
        sentences.append(sentence)

    return sentences
def read_conll_ud(path_to_conll_file: str) -> List[Sentence]:
    """
    Reads a file in CoNLL-U format and produces a list of Sentence objects
    with full morphosyntactic annotation.
    :param path_to_conll_file: the path to the conll-u file
    :return: list of sentences
    """
    sentences: List[Sentence] = []
    lines: List[str] = open(path_to_conll_file, encoding='utf-8').read().strip().split('\n')

    sentence: Sentence = Sentence()
    for line in lines:
        fields: List[str] = re.split(r"\s+", line)

        if line == '':
            if len(sentence) > 0:
                sentences.append(sentence)
            sentence = Sentence()
        elif line.startswith('#'):
            continue
        elif '.' in fields[0]:
            # skip empty nodes (decimal token IDs)
            continue
        elif '-' in fields[0]:
            # skip multiword tokens (range token IDs)
            continue
        else:
            token = Token(fields[1], head_id=int(fields[6]))
            token.add_tag('lemma', str(fields[2]))
            token.add_tag('upos', str(fields[3]))
            token.add_tag('pos', str(fields[4]))
            token.add_tag('dependency', str(fields[7]))

            for morph in str(fields[5]).split('|'):
                if "=" not in morph:
                    continue
                token.add_tag(morph.split('=')[0].lower(), morph.split('=')[1])

            if len(fields) > 10 and str(fields[10]) == 'Y':
                token.add_tag('frame', str(fields[11]))

            sentence.add_token(token)

    if len(sentence.tokens) > 0:
        sentences.append(sentence)

    return sentences
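# Usage sketch (the treebank path is hypothetical): parse a CoNLL-U file and
# inspect the annotations of the first token of the first sentence.
sentences = read_conll_ud('data/en_ewt-ud-train.conllu')
first_token = sentences[0][0]
print(first_token.text,
      first_token.get_tag('upos').value,
      first_token.get_tag('dependency').value)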
def train(self, intent_fst) -> None:
    from flair.data import Sentence, Token
    from flair.models import SequenceTagger, TextClassifier
    from flair.embeddings import (
        FlairEmbeddings,
        StackedEmbeddings,
        DocumentRNNEmbeddings,
    )
    from flair.data import TaggedCorpus
    from flair.trainers import ModelTrainer

    # Directory to look for downloaded embeddings
    cache_dir = self.profile.read_path(
        self.profile.get("intent.flair.cache_dir", "flair/cache")
    )
    os.makedirs(cache_dir, exist_ok=True)

    # Directory to store generated models
    data_dir = self.profile.write_path(
        self.profile.get("intent.flair.data_dir", "flair/data")
    )
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)

    self.embeddings = self.profile.get("intent.flair.embeddings", [])
    assert len(self.embeddings) > 0, "No word embeddings"

    # Create directories to write training data to
    class_data_dir = os.path.join(data_dir, "classification")
    ner_data_dir = os.path.join(data_dir, "ner")
    os.makedirs(class_data_dir, exist_ok=True)
    os.makedirs(ner_data_dir, exist_ok=True)

    # Convert FST to training data
    class_data_path = os.path.join(class_data_dir, "train.txt")
    ner_data_path = os.path.join(ner_data_dir, "train.txt")

    # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] }
    sentences_by_intent: Dict[str, Any] = {}

    # Get sentences for training
    do_sampling = self.profile.get("intent.flair.do_sampling", True)
    start_time = time.time()

    if do_sampling:
        # Sample from each intent FST
        num_samples = int(self.profile.get("intent.flair.num_samples", 10000))
        intent_map_path = self.profile.read_path(
            self.profile.get("training.intent.intent_map", "intent_map.json")
        )
        with open(intent_map_path, "r") as intent_map_file:
            intent_map = json.load(intent_map_file)

        # Gather FSTs for all known intents
        fsts_dir = self.profile.write_dir(
            self.profile.get("speech_to_text.fsts_dir")
        )
        intent_fst_paths = {
            intent_id: os.path.join(fsts_dir, f"{intent_id}.fst")
            for intent_id in intent_map.keys()
        }

        # Generate samples
        self._logger.debug(
            f"Generating {num_samples} sample(s) from {len(intent_fst_paths)} intent(s)"
        )
        sentences_by_intent = sample_sentences_by_intent(
            intent_fst_paths, num_samples
        )
    else:
        # Exhaustively generate all sentences
        self._logger.debug(
            "Generating all possible sentences (may take a long time)"
        )
        sentences_by_intent = make_sentences_by_intent(intent_fst)

    sentence_time = time.time() - start_time
    self._logger.debug(f"Generated sentences in {sentence_time} second(s)")

    # Get least common multiple in order to balance sentences by intent
    lcm_sentences = lcm(*(len(sents) for sents in sentences_by_intent.values()))

    # Generate examples
    class_sentences = []
    ner_sentences: Dict[str, List[Sentence]] = defaultdict(list)
    for intent_name, intent_sents in sentences_by_intent.items():
        num_repeats = max(1, lcm_sentences // len(intent_sents))
        for intent_sent in intent_sents:
            # Only train an intent classifier if there's more than one intent
            if len(sentences_by_intent) > 1:
                # Add balanced copies
                for i in range(num_repeats):
                    class_sent = Sentence(labels=[intent_name])
                    for word in intent_sent["tokens"]:
                        class_sent.add_token(Token(word))
                    class_sentences.append(class_sent)

            if len(intent_sent["entities"]) == 0:
                continue  # no entities, no sequence tagger

            # Named entity recognition (NER) example
            token_idx = 0
            entity_start = {ev["start"]: ev for ev in intent_sent["entities"]}
            entity_end = {ev["end"]: ev for ev in intent_sent["entities"]}
            entity = None

            word_tags = []
            for word in intent_sent["tokens"]:
                # Determine tag label
                tag = "O" if not entity else f"I-{entity}"
                if token_idx in entity_start:
                    entity = entity_start[token_idx]["entity"]
                    tag = f"B-{entity}"

                word_tags.append((word, tag))  # word ner
                token_idx += len(word) + 1

                if (token_idx - 1) in entity_end:
                    entity = None

            # Add balanced copies
            for i in range(num_repeats):
                ner_sent = Sentence()
                for word, tag in word_tags:
                    token = Token(word)
                    token.add_tag("ner", tag)
                    ner_sent.add_token(token)
                ner_sentences[intent_name].append(ner_sent)

    # Start training
    max_epochs = int(self.profile.get("intent.flair.max_epochs", 100))

    # Load word embeddings
    self._logger.debug(f"Loading word embeddings from {cache_dir}")
    word_embeddings = [
        FlairEmbeddings(os.path.join(cache_dir, "embeddings", e))
        for e in self.embeddings
    ]

    if len(class_sentences) > 0:
        self._logger.debug("Training intent classifier")

        # Random 80/10/10 split
        class_train, class_dev, class_test = self._split_data(class_sentences)
        class_corpus = TaggedCorpus(class_train, class_dev, class_test)

        # Intent classification
        doc_embeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )

        classifier = TextClassifier(
            doc_embeddings,
            label_dictionary=class_corpus.make_label_dictionary(),
            multi_label=False,
        )

        self._logger.debug(
            f"Intent classifier has {len(class_sentences)} example(s)"
        )
        trainer = ModelTrainer(classifier, class_corpus)
        trainer.train(class_data_dir, max_epochs=max_epochs)
    else:
        self._logger.info("Skipping intent classifier training")

    if len(ner_sentences) > 0:
        self._logger.debug(f"Training {len(ner_sentences)} NER sequence tagger(s)")

        # Named entity recognition
        stacked_embeddings = StackedEmbeddings(word_embeddings)
        for intent_name, intent_ner_sents in ner_sentences.items():
            ner_train, ner_dev, ner_test = self._split_data(intent_ner_sents)
            ner_corpus = TaggedCorpus(ner_train, ner_dev, ner_test)

            tagger = SequenceTagger(
                hidden_size=256,
                embeddings=stacked_embeddings,
                tag_dictionary=ner_corpus.make_tag_dictionary(tag_type="ner"),
                tag_type="ner",
                use_crf=True,
            )

            ner_intent_dir = os.path.join(ner_data_dir, intent_name)
            os.makedirs(ner_intent_dir, exist_ok=True)

            self._logger.debug(
                f"NER tagger for {intent_name} has {len(intent_ner_sents)} example(s)"
            )
            trainer = ModelTrainer(tagger, ner_corpus)
            trainer.train(ner_intent_dir, max_epochs=max_epochs)
    else:
        self._logger.info("Skipping NER sequence tagger training")
def process_conll_doc(input_file_name, output_file_name, ner_model,
                      with_disambiguation, sim_level_disambig):
    nertagger = SequenceTagger.load(ner_model)
    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name, "r") as input_file, \
            open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0
        for line in input_file:
            if "DOCSTART" in line:
                if doc is None:
                    doc = Sentence()
                else:
                    docs.append(doc)
                    doc = Sentence()
                spos = 0
            else:
                lsplit = line.split("\t")
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0 and c < len(lsplit):
                        token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1
                doc.add_token(token)
        # append the final document as well (otherwise it would be dropped)
        if doc is not None:
            docs.append(doc)

        for d in docs:
            nertagger.predict(d)

            spans = []
            for nerspan in d.get_spans('ner'):
                start = nerspan.start_pos
                length = nerspan.end_pos - nerspan.start_pos
                spans.append({"start": start, "length": length})

            myjson = {
                "text": unidecode.unidecode(d.to_tokenized_string()),
                "spans": spans
            }
            res = requests.post(NEURAL_EL_SERVER, json=myjson)
            info = res.json()

            for nerspan in d.get_spans('ner'):
                for i in info:
                    if i[0] == nerspan.start_pos:
                        for t in nerspan.tokens:
                            t.add_tag("pnme", i[2])
                        break

            if with_disambiguation:
                searcher = load_disambiguation()
                for nerspan in d.get_spans('ner'):
                    if "pnme" not in nerspan.tokens[0].tags:
                        r = searcher.search(nerspan.text.lower(), sim_level_disambig)
                        if len(r) > 0:
                            d_tag = unidecode.unidecode(
                                (string.capwords(r[0]) + "_(disambiguation)").replace(" ", "_"))
                            for t2 in nerspan.tokens:
                                t2.add_tag("pnme", d_tag)

            for t in d:
                output_file.write(
                    t.text + "\t" +
                    t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")
def create_sentlist_from_file_batchmax(self, data, maxlen=64, compare_column="cat"):
    """
    Takes a pandas dataframe with columns 'tok' and 'sentstart' and creates a
    list of flair Sentence objects with tags.

    Each flair Sentence object may contain several real sentences, but at most
    maxlen tokens. The Sentence object stops at a sentence boundary, so it is
    often shorter than maxlen. Sentences longer than maxlen are split!

    If a line with token value "EOF" is encountered, a shorter flair Sentence
    object is returned, so no file boundaries are crossed.

    :param data: pandas dataframe with columns 'tok' and 'sentstart'
    :param maxlen: maximum number of tokens per flair Sentence
    :param compare_column: column holding the tag values ("NaN" to skip tags)
    :return: list of flair Sentence objects
    """
    sent_list = []
    toklist = []
    catlist = []
    # len_last_token is needed to add proper start/end pos for each sentence token
    len_last_token = 0

    # track the sentence that is currently being processed
    curr_sentence_tok = []
    curr_sentence_cat = []

    for index, row in data.iterrows():
        tok = str(row["tok"])
        if compare_column != "NaN":
            cat = str(row[compare_column])
        else:
            cat = "-"

        # if the current token is "EOF" this marks the end of a sample file;
        # chunks may not cross file boundaries, therefore end the sentence here in any case
        if tok == "EOF":
            # do not add this token to any list;
            # merge toklist and curr_sentence_tok to get all current tokens
            # and create a flair sentence
            toklist.extend(curr_sentence_tok)
            catlist.extend(curr_sentence_cat)

            self.logger.debug("create chunk at EOF with (len: {}): {}".format(len(toklist), toklist))
            self.logger.debug("catlist with (len: {}): {}".format(len(catlist), catlist))

            sent = Sentence()
            for i, tok in enumerate(toklist):
                flair_tok = Token(str(tok), start_position=len_last_token)
                len_last_token += len(tok) + 1
                flair_tok.add_tag("cat", catlist[i])
                sent.add_token(flair_tok)
            if len(sent.tokens) > 0:
                sent_list.append(sent)

            len_last_token = 0
            toklist = []
            catlist = []
            # reset the curr sent lists as well
            curr_sentence_tok = []
            curr_sentence_cat = []
        else:
            # at the start of a new sentence, move the contents of curr_sentence_tok
            # and curr_sentence_cat to the main lists and start a new curr_sentence
            if row["sentstart"] == "yes":
                toklist.extend(curr_sentence_tok)
                catlist.extend(curr_sentence_cat)
                curr_sentence_tok = [tok]
                curr_sentence_cat = [cat]
            else:
                curr_sentence_tok.append(tok)
                curr_sentence_cat.append(cat)

            # if the combined length of toklist and curr_sentence_tok is > maxlen now,
            # create a flair sentence with the tokens in toklist and reset it;
            # the remaining tokens in curr_sentence_tok are saved for the next chunk
            if len(toklist) + len(curr_sentence_tok) > maxlen:
                # if toklist is empty at this point, we have a sentence > maxlen
                # and must split it. The last token currently in curr_sentence will
                # be preserved for later so that the chunk is not too long
                if len(toklist) == 0:
                    toklist.extend(curr_sentence_tok[0:-1])
                    catlist.extend(curr_sentence_cat[0:-1])
                    curr_sentence_tok = [curr_sentence_tok[-1]]
                    curr_sentence_cat = [curr_sentence_cat[-1]]
                    self.logger.debug("Sentence is split (len: {}): {}".format(len(toklist), toklist))

                self.logger.debug("create chunk with (len: {}): {}".format(len(toklist), toklist))
                self.logger.debug("catlist with (len: {}): {}".format(len(catlist), catlist))

                sent = Sentence()
                for i, tok in enumerate(toklist):
                    flair_tok = Token(str(tok), start_position=len_last_token)
                    len_last_token += len(tok) + 1
                    flair_tok.add_tag("cat", str(catlist[i]))
                    sent.add_token(flair_tok)
                if len(sent.tokens) > 0:
                    sent_list.append(sent)

                len_last_token = 0
                toklist = []
                catlist = []
                self.logger.debug("toklist: {}, curr_sent_tok: {}".format(
                    len(toklist), len(curr_sentence_tok)))

    # when the loop is complete, empty the buffers and add them to the list
    if len(curr_sentence_tok) > 0:
        toklist.extend(curr_sentence_tok)
        catlist.extend(curr_sentence_cat)

        sent = Sentence()
        for i, tok in enumerate(toklist):
            flair_tok = Token(str(tok), start_position=len_last_token)
            len_last_token += len(tok) + 1
            flair_tok.add_tag("cat", str(catlist[i]))
            sent.add_token(flair_tok)
        if len(sent.tokens) > 0:
            sent_list.append(sent)
        len_last_token = 0

    return sent_list
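# Usage sketch (hypothetical data and instance): build a small dataframe in
# the expected 'tok'/'sentstart' format and chunk it into flair Sentences.
# `preparer` stands in for an instance of the class defining this method.
import pandas as pd

df = pd.DataFrame({
    "tok":       ["Das", "ist", "gut", ".", "EOF"],
    "sentstart": ["yes", "no", "no", "no", "no"],
    "cat":       ["O", "O", "B-X", "O", "O"],
})
sents = preparer.create_sentlist_from_file_batchmax(df, maxlen=64, compare_column="cat")
print(sents[0].to_tagged_string())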
def process_conll_doc(input_file_name, output_file_name, ner_model,
                      with_disambiguation, sim_level_disambig):
    nertagger = SequenceTagger.load(ner_model)
    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name, "r") as input_file, \
            open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0
        for line in input_file:
            if "DOCSTART" in line:
                if doc is None:
                    doc = Sentence()
                else:
                    docs.append(doc)
                    doc = Sentence()
                spos = 0
            else:
                lsplit = line.split("\t")
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0 and c < len(lsplit):
                        token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1
                doc.add_token(token)
        # append the final document as well (otherwise it would be dropped)
        if doc is not None:
            docs.append(doc)

        for d in docs:
            nertagger.predict(d)

            # rebuild the sentence with predicted entities wrapped in <entity> tags
            centity = []
            newsent = []
            for token in d:
                nertag = token.get_tag("ner").value
                if nertag[0:2] in ['B-', 'S-']:
                    if len(centity) != 0:
                        newsent.append("<entity>" + " ".join(centity) + "</entity>")
                        centity = []
                    centity.append(token.text)
                if nertag[0:2] in ['E-', 'I-']:
                    centity.append(token.text)
                if nertag == "O":
                    if len(centity) != 0:
                        newsent.append("<entity>" + " ".join(centity) + "</entity>")
                        centity = []
                    newsent.append(token.text)

            sent_for_ag = " ".join(newsent)
            agres = ag.disambiguate(sent_for_ag)

            for entity in d.get_spans('ner'):
                for r in agres:
                    if r["namedEntity"] == entity.text:
                        for t in entity.tokens:
                            t.add_tag("pnme", r["disambiguatedURL"])
                        break

            if with_disambiguation:
                searcher = load_disambiguation()
                for nerspan in d.get_spans('ner'):
                    if "pnme" not in nerspan.tokens[0].tags:
                        r = searcher.search(nerspan.text.lower(), sim_level_disambig)
                        if len(r) > 0:
                            d_tag = unidecode.unidecode(
                                (string.capwords(r[0]) + "_(disambiguation)").replace(" ", "_"))
                            for t2 in nerspan.tokens:
                                t2.add_tag("pnme", d_tag)

            for t in d:
                output_file.write(
                    t.text + "\t" +
                    t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")
def test_check_input(self):
    """ Test for check_input function """
    phone_sigs = [
        'cell', 'Cell', 'phone', 'Phone', 'Phone/fax', 'phone/fax', 'Phone/Fax'
    ]
    fax_sigs = ['Fax', 'fax']

    # Check for email address
    sentence = Sentence()
    token = Token('hello')
    tag = 'S-email_id'
    token.add_tag('ner', tag)
    sentence.add_token(token)
    app.check_input(sentence)
    return_val = sentence[0].get_tag('ner').value
    self.assertNotEqual(return_val, tag)

    token = Token('*****@*****.**')
    sentence.add_token(token)
    app.check_input(sentence)
    return_val = sentence[1].get_tag('ner').value
    self.assertEqual(return_val, tag)

    token = Token('*****@*****.**')
    sentence.add_token(token)
    app.check_input(sentence)
    return_val = sentence[2].get_tag('ner').value
    self.assertNotEqual(return_val, tag)

    # Check for phone number
    for sig in phone_sigs:
        sentence = Sentence()
        token = Token(sig)
        tag = 'S-phone'
        token.add_tag('ner', tag)
        sentence.add_token(token)
        token = Token('123-456-7890')
        sentence.add_token(token)
        app.check_input(sentence)
        return_val = sentence[0].get_tag('ner').value
        self.assertNotEqual(return_val, tag)
        return_val = sentence[1].get_tag('ner').value
        self.assertEqual(return_val, tag)

    # Check for fax number
    for sig in fax_sigs:
        sentence = Sentence()
        token = Token(sig)
        tag = 'S-fax'
        token.add_tag('ner', tag)
        sentence.add_token(token)
        token = Token('123-456-7890')
        sentence.add_token(token)
        app.check_input(sentence)
        return_val = sentence[0].get_tag('ner').value
        self.assertNotEqual(return_val, tag)
        return_val = sentence[1].get_tag('ner').value
        self.assertEqual(return_val, tag)

    # Check for zipcode: grow the number one digit at a time; only the
    # five-digit form should be tagged
    num = ''
    for i in range(10):
        num += str(i)
        sentence = Sentence()
        token = Token(num)
        tag = 'S-zipcode'
        sentence.add_token(token)
        app.check_input(sentence)
        return_val = sentence[0].get_tag('ner').value
        if len(num) == 5:
            self.assertEqual(return_val, tag)
        else:
            self.assertNotEqual(return_val, tag)
def __init__(
    self,
    path_to_column_file: Path,
    column_name_map: Dict[int, str],
    tag_to_bioes: str = None,
    comment_symbol: str = None,
    in_memory: bool = True,
    document_separator_token: str = None,
    encoding: str = "utf-8",
):
    """
    Instantiates a column dataset (typically used for sequence labeling or
    word-level prediction).

    :param path_to_column_file: path to the file with the column-formatted data
    :param column_name_map: a map specifying the column format
    :param tag_to_bioes: whether to convert to BIOES tagging scheme
    :param comment_symbol: if set, lines that begin with this symbol are treated as comments
    :param in_memory: if set to True, the dataset is kept in memory as Sentence objects,
        otherwise it does disk reads
    :param document_separator_token: if provided, multiple sentences are read into one object.
        Provide the string token that indicates that a new document begins
    """
    assert path_to_column_file.exists()

    self.path_to_column_file = path_to_column_file
    self.tag_to_bioes = tag_to_bioes
    self.column_name_map = column_name_map
    self.comment_symbol = comment_symbol
    self.document_separator_token = document_separator_token

    # store either Sentence objects in memory, or only file offsets
    self.in_memory = in_memory
    if self.in_memory:
        self.sentences: List[Sentence] = []
    else:
        self.indices: List[int] = []

    self.total_sentence_count: int = 0

    # most data sets have the token text in the first column; if not, pass 'text' as column
    self.text_column: int = 0
    for column in self.column_name_map:
        if column_name_map[column] == "text":
            self.text_column = column

    # determine encoding of text file
    self.encoding = encoding

    sentence: Sentence = Sentence()
    with open(str(self.path_to_column_file), encoding=self.encoding) as f:
        line = f.readline()
        position = 0
        while line:
            if self.comment_symbol is not None and line.startswith(comment_symbol):
                line = f.readline()
                continue

            if self.__line_completes_sentence(line):
                if len(sentence) > 0:
                    sentence.infer_space_after()
                    if self.in_memory:
                        if self.tag_to_bioes is not None:
                            sentence.convert_tag_scheme(
                                tag_type=self.tag_to_bioes, target_scheme="iobes")
                        self.sentences.append(sentence)
                    else:
                        self.indices.append(position)
                        position = f.tell()
                    self.total_sentence_count += 1
                sentence = Sentence()
            else:
                fields: List[str] = re.split("[\t\n]", line)
                token = Token(fields[self.text_column])
                for column in column_name_map:
                    if len(fields) > column and column != self.text_column:
                        token.add_tag(self.column_name_map[column], fields[column])
                if not line.isspace():
                    sentence.add_token(token)

            line = f.readline()

    if len(sentence.tokens) > 0:
        sentence.infer_space_after()
        if self.in_memory:
            self.sentences.append(sentence)
        else:
            self.indices.append(position)
        self.total_sentence_count += 1
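# Usage sketch: assumes this __init__ belongs to a ColumnDataset-style class
# (called ColumnDataset here); the data path is hypothetical.
dataset = ColumnDataset(
    path_to_column_file=Path('data/conll03/train.txt'),
    column_name_map={0: 'text', 1: 'pos', 2: 'np', 3: 'ner'},
    tag_to_bioes='ner',  # convert the 'ner' column to BIOES
    in_memory=False,     # keep only file offsets and read sentences lazily
)
print(dataset.total_sentence_count)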
def read_conll_ud(path_to_conll_file: Path) -> List[Sentence]:
    """
    Reads a file in CoNLL-U format and produces a list of Sentence objects
    with full morphosyntactic annotation.
    :param path_to_conll_file: the path to the conll-u file
    :return: list of sentences
    """
    sentences: List[Sentence] = []
    lines: List[str] = open(path_to_conll_file, encoding="utf-8").read().strip().split("\n")

    sentence: Sentence = Sentence()
    for line in lines:
        fields: List[str] = re.split("\t+", line)

        if line == "":
            if len(sentence) > 0:
                sentences.append(sentence)
            sentence = Sentence()
        elif line.startswith("#"):
            continue
        elif "." in fields[0]:
            continue
        elif "-" in fields[0]:
            continue
        else:
            token = Token(fields[1], head_id=int(fields[6]))
            token.add_tag("lemma", str(fields[2]))
            token.add_tag("upos", str(fields[3]))
            token.add_tag("pos", str(fields[4]))
            token.add_tag("dependency", str(fields[7]))

            for morph in str(fields[5]).split("|"):
                if "=" not in morph:
                    continue
                token.add_tag(morph.split("=")[0].lower(), morph.split("=")[1])

            if len(fields) > 10 and str(fields[10]) == "Y":
                token.add_tag("frame", str(fields[11]))

            sentence.add_token(token)

    if len(sentence.tokens) > 0:
        sentences.append(sentence)

    return sentences
def read_conll_ud(path_to_conll_file: str) -> List[Sentence]:
    sentences: List[Sentence] = []
    lines: List[str] = open(path_to_conll_file, encoding='utf-8').read().strip().split('\n')

    sentence: Sentence = Sentence()
    for line in lines:
        fields: List[str] = re.split(r"\s+", line)

        if line == '':
            if len(sentence) > 0:
                sentences.append(sentence)
            sentence = Sentence()
        elif line.startswith('#'):
            continue
        elif '.' in fields[0]:
            continue
        elif '-' in fields[0]:
            continue
        else:
            token = Token(fields[1], head_id=int(fields[6]))
            token.add_tag('lemma', str(fields[2]))
            token.add_tag('upos', str(fields[3]))
            token.add_tag('pos', str(fields[4]))
            token.add_tag('dependency', str(fields[7]))

            for morph in str(fields[5]).split('|'):
                if "=" not in morph:
                    continue
                token.add_tag(morph.split('=')[0].lower(), morph.split('=')[1])

            if len(fields) > 10 and str(fields[10]) == 'Y':
                token.add_tag('frame', str(fields[11]))

            sentence.add_token(token)

    if len(sentence.tokens) > 0:
        sentences.append(sentence)

    return sentences