Example 1
def process_file(source_path, target_path):

    ds = DataSet(target_path)
    with open(source_path, encoding="utf8") as f:
        cur_document = Document()

        for line in f:
            tokens = line.split()
            if not tokens:  # guard against blank lines
                continue
            if tokens[-1] != "~":  # content line with inline <ANON> markup
                cur_sentence = []
                cur_anno = []
                anon = 0
                for w in tokens[1:]:  # the first token is a line identifier
                    if w == "<ANON>":  # opening tag: following tokens are anonymized
                        anon = 1
                    elif w == "</ANON>":  # closing tag
                        anon = 0
                    else:
                        cur_sentence.append(w)
                        cur_anno.append(anon)
                cur_document.sentences.append(cur_sentence)
                cur_document.annotated.append(cur_anno)
            elif cur_document.sentences:  # a line ending in "~" separates documents
                ds.add_document(cur_document)
                cur_document = Document()

        if cur_document.sentences:  # flush the last document
            ds.add_document(cur_document)

    #  ds.write_data()
    print(ds.word_count, ds.annotation_count)
    return ds
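
All of these examples rely on Document and DataSet classes from the surrounding project. The sketch below reconstructs the minimal interface the calls above imply; the attribute and method names are taken from the examples, but every body is an assumption, not the project's actual implementation.

class Document:
    """Tokenized sentences plus a parallel list of per-token annotation flags."""

    def __init__(self, raw_text=""):
        self.raw_text = raw_text
        self.sentences = []  # each sentence is a list of token strings
        self.annotated = []  # parallel flag lists (0 = plain, 1/2 = annotated)

    def create_from_text(self):
        # Assumed to tokenize self.raw_text; a naive stand-in for illustration.
        self.sentences = [s.split() for s in self.raw_text.split(". ") if s.strip()]

    def create_text(self):
        # Assumed to rebuild the text with annotation markup reinserted;
        # here it just rejoins the tokens.
        return " ".join(w for sent in self.sentences for w in sent)


class DataSet:
    """Collects documents and exposes simple aggregate counts."""

    def __init__(self, path=""):
        self.path = path
        self.documents = []

    def add_document(self, document):
        self.documents.append(document)

    def write_data(self):
        # Assumed to serialize the collected documents to self.path.
        pass

    @property
    def word_count(self):
        return sum(len(s) for d in self.documents for s in d.sentences)

    @property
    def annotation_count(self):
        return sum(1 for d in self.documents for flags in d.annotated
                   for flag in flags if flag)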
Example 2
import os


def prepare_keyphrase_data(clf_model, data_dir):
    all_data_files = os.listdir(data_dir)
    # Documents come as <name>.txt / <name>.key pairs; collect the unique
    # base names (extension stripped).
    main_data_files = list(
        {os.path.splitext(datafile)[0] for datafile in all_data_files})

    documents = {}
    targets = {}

    for f_name in main_data_files:
        with open(os.path.join(data_dir, f_name + ".txt"),
                  encoding="ISO-8859-1") as f:
            # TODO maybe replace " with \"
            raw_text = ". ".join(line.strip() for line in f)
            doc = Document(raw_text=raw_text)
            doc.create_from_text()
            doc.annotated = clf_model._annotate_document(doc)
            documents[f_name] = doc
        with open(os.path.join(data_dir, f_name + ".key"),
                  encoding="ISO-8859-1") as f:
            # One gold keyphrase per line.
            targets[f_name] = [line.strip() for line in f]

    return documents, targets
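
A hypothetical call, assuming a directory of paired files such as doc1.txt / doc1.key (raw text in the .txt file, one gold keyphrase per line in the .key file); the directory name and the clf_model instance are illustrative:

documents, targets = prepare_keyphrase_data(clf_model, "keyphrase_data")
print(targets["doc1"])  # gold keyphrases for doc1
print(documents["doc1"].annotated)  # predicted annotations for doc1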
Example 3
def prepare_data(clf_model, data_path):
    """
    Loads test data and annotates it for anonymization.
    :param BasicClassifier clf_model: trained classifier used for annotation
    :param data_path: path to a file with one raw document per line
    :return: DataSet containing the test documents with anon annotations
    """
    ds = DataSet(path="")

    with open(data_path, encoding="ISO-8859-1") as f:
        for line in f:  # one raw document per line
            doc = Document(raw_text=line)
            doc.create_from_text()
            doc.annotated = clf_model._annotate_document(doc)
            ds.add_document(document=doc)
    print("Number of annotated words:", ds.annotation_count)
    return ds
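
A hypothetical call, with clf_model as in the earlier examples and an illustrative path:

test_ds = prepare_data(clf_model, "test_documents.txt")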
Example 4
def process_file(source_path, target_path):

    ds = DataSet(target_path)
    with open(source_path, encoding="utf8") as f:
        cur_document = Document()
        cur_sentence = []
        cur_anno = []

        for line in f:
            if len(line) > 1:  # non-blank line (a blank line is just "\n")
                temp_word_list = line.split()
                if temp_word_list[0] == "-DOCSTART-":  # document boundary
                    if cur_document.sentences:
                        ds.add_document(cur_document)
                        cur_document = Document()
                else:
                    cur_sentence.append(temp_word_list[0])
                    anno = temp_word_list[-1]
                    if anno[0] == "B":  # marks the first word of a named entity
                        cur_anno.append(1)
                    elif anno[0] == "I":  # marks any word of a named entity except the first
                        cur_anno.append(2)
                    elif temp_word_list[1] == "CD":  # POS tag marking dates and numbers
                        cur_anno.append(1)
                    else:
                        cur_anno.append(0)
            elif cur_sentence:  # blank line ends the current sentence
                cur_document.sentences.append(list(cur_sentence))
                cur_document.annotated.append(list(cur_anno))
                cur_sentence = []
                cur_anno = []

        if cur_sentence:  # flush a sentence left pending at EOF
            cur_document.sentences.append(list(cur_sentence))
            cur_document.annotated.append(list(cur_anno))
        if cur_document.sentences:
            ds.add_document(cur_document)

    ds.write_data()
    print(ds.word_count, ds.annotation_count)
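
This parser expects CoNLL-2003-style input: one token per line with the token in the first column, the POS tag in the second, and the NER tag in the last; blank lines end sentences and -DOCSTART- lines separate documents. An illustrative fragment in that style:

-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN B-NP O
. . O O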
Example 5
def annotate_string(self, input_text):
    # Wrap the raw string in a single-document DataSet so it can be embedded.
    doc = Document(raw_text=input_text)
    doc.create_from_text()
    ds = DataSet("")
    ds.add_document(doc)
    embedded = self.data_helper.embed_dataset(ds)[0]  # embeddings for the one document

    # Predict annotations sentence by sentence.
    predicted = []
    for sent_emb in embedded:
        data = np.array(sent_emb)  # assumes numpy is imported as np
        p = self.model.predict(data)
        predicted.append(p)
    doc.annotated = predicted
    return doc.create_text()
Example 6
def annotate_string(self, input_text):
    doc = Document(raw_text=input_text)
    doc.create_from_text()
    annotated = self._annotate_document(doc)
    doc.annotated = annotated
    return doc.create_text()
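
Both variants return the document re-serialized with its predicted annotations; the second delegates to _annotate_document, the same helper used in Examples 2 and 3, instead of embedding and predicting inline. A hypothetical round trip (clf_model as in the earlier examples):

marked_up = clf_model.annotate_string("John Smith was born on 12 May 1999.")
print(marked_up)  # input text with the predicted annotation markup reinserted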