import os

import numpy as np

# DataSet and Document are project-local classes; their import lines are not
# part of this excerpt.


def process_file(source_path, target_path):
    """Parses an anonymization corpus in which <ANON>...</ANON> spans mark
    sensitive words and a line ending in a bare "~" closes a document."""
    ds = DataSet(target_path)
    with open(source_path, encoding="utf8") as f:
        cur_document = Document()
        for line in f:
            tokens = line.split()
            if not tokens:
                # skip blank lines; tokens[-1] would raise IndexError otherwise
                continue
            if tokens[-1] != "~":
                cur_sentence = []
                cur_anno = []
                anon = 0
                # tokens[0] is skipped (presumably a line identifier)
                for w in tokens[1:]:
                    if w == "<ANON>":
                        anon = 1
                    elif w == "</ANON>":
                        anon = 0
                    else:
                        cur_sentence.append(w)
                        cur_anno.append(anon)
                cur_document.sentences.append(list(cur_sentence))
                cur_document.annotated.append(list(cur_anno))
            elif cur_document.sentences:
                # "~" separator: close the current document and start a new one
                ds.add_document(cur_document)
                cur_document = Document()
        if cur_document.sentences:
            ds.add_document(cur_document)
    # ds.write_data()
    print(ds.word_count, ds.annotation_count)
    return ds
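# Usage sketch (paths are illustrative, not taken from the repo):
#     ds = process_file("corpus/anon_train.txt", "processed/anon_train")
# Assumed input format, as implied by the parser above: each line reads
# "<line-id> tok tok <ANON> tok </ANON> ...", and a line whose last token is
# a bare "~" closes the current document.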
def prepare_keyphrase_data(clf_model, data_dir):
    all_data_files = os.listdir(data_dir)
    # Collect the base names shared by the paired "<name>.txt" / "<name>.key"
    # files; the "." guard avoids an IndexError on extensionless entries.
    main_data_files = list(
        set(datafile.split(".")[-2] for datafile in all_data_files
            if "." in datafile))
    documents = {}
    targets = {}
    for f_name in main_data_files:
        with open(os.path.join(data_dir, f_name + ".txt"),
                  encoding="ISO-8859-1") as f:
            # TODO maybe replace " with \"
            raw_text = ". ".join(line.strip() for line in f)
            doc = Document(raw_text=raw_text)
            doc.create_from_text()
            doc.annotated = clf_model._annotate_document(doc)
            documents[f_name] = doc
        with open(os.path.join(data_dir, f_name + ".key"),
                  encoding="ISO-8859-1") as f:
            targets[f_name] = [line.strip() for line in f]
    return documents, targets
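# Usage sketch (hypothetical directory layout): a keyphrase corpus stored as
# paired "<name>.txt" / "<name>.key" files in one directory:
#     docs, keys = prepare_keyphrase_data(trained_clf, "data/keyphrases")
#     some_name = next(iter(keys))
#     print(keys[some_name])  # gold keyphrases for that document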
def prepare_data(clf_model, data_path):
    """
    Loads and anonymizes test data.
    :param BasicClassifier clf_model:
    :param data_path:
    :return: dataset containing test data with anon annotations
    """
    ds = DataSet(path="")
    with open(data_path, encoding="ISO-8859-1") as f:
        for line in f:
            doc = Document(raw_text=line)
            doc.create_from_text()
            doc.annotated = clf_model._annotate_document(doc)
            ds.add_document(document=doc)
    print("Number of annotated words:", ds.annotation_count)
    return ds
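# Usage sketch (illustrative): each line of the file at data_path is treated
# as one document and annotated via the classifier's _annotate_document hook:
#     test_ds = prepare_data(trained_clf, "data/test_lines.txt")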
def process_file(source_path, target_path):
    ds = DataSet(target_path)
    with open(source_path, encoding="utf8") as f:
        cur_document = Document()
        cur_sentence = []
        cur_anno = []
        for line in f:
            if len(line) > 1:  # token line (blank separator lines are just "\n")
                temp_word_list = line.split()
                if temp_word_list[0] == "-DOCSTART-":
                    # document boundary marker
                    if cur_document.sentences:
                        ds.add_document(cur_document)
                        cur_document = Document()
                else:
                    cur_sentence.append(temp_word_list[0])
                    anno = temp_word_list[-1]
                    if anno[0] == "B":
                        # marks the first word of a named entity
                        cur_anno.append(1)
                    elif anno[0] == "I":
                        # marks any word of a named entity except the first
                        cur_anno.append(2)
                    elif temp_word_list[1] == "CD":
                        # the CD part-of-speech tag marks dates and numbers
                        cur_anno.append(1)
                    else:
                        cur_anno.append(0)
            elif cur_sentence:
                # blank line: sentence boundary
                cur_document.sentences.append(list(cur_sentence))
                cur_document.annotated.append(list(cur_anno))
                cur_sentence = []
                cur_anno = []
        if cur_document.sentences:
            ds.add_document(cur_document)
    ds.write_data()
    print(ds.word_count, ds.annotation_count)
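# The parser above consumes CoNLL-2003-style input (an assumption based on
# the "-DOCSTART-" markers, the B-/I- entity tags in the last column, and the
# POS tag in column 1): one token per line, blank lines between sentences. A
# well-known CoNLL-2003 excerpt, shown here only to document the format:
_CONLL_SAMPLE = """-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN I-NP O
"""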
def annotate_string(self, input_text):
    doc = Document(raw_text=input_text)
    doc.create_from_text()
    ds = DataSet("")
    ds.add_document(doc)
    # embed_dataset presumably returns one embedding list per document;
    # [0] selects the single document added above
    embedded = self.data_helper.embed_dataset(ds)[0]
    predicted = []
    for sent_emb in embedded:
        data = np.array(sent_emb)
        p = self.model.predict(data)
        predicted.append(p)
    doc.annotated = predicted
    return doc.create_text()
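# Usage sketch: annotate_string is a method, so it assumes an enclosing
# classifier class (name hypothetical) holding a trained self.model and a
# self.data_helper that provides embed_dataset:
#     clf = EmbeddingClassifier(...)  # hypothetical constructor
#     print(clf.annotate_string("John Smith lives in Berlin."))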
def annotate_string(self, input_text):
    doc = Document(raw_text=input_text)
    doc.create_from_text()
    annotated = self._annotate_document(doc)
    doc.annotated = annotated
    return doc.create_text()
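# Usage sketch: this variant delegates to the classifier's own
# _annotate_document (the same hook used by prepare_data above), so any
# classifier implementing that method can serve it:
#     print(clf.annotate_string("Call me at 555-0100."))  # illustrative input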