def read_txt(file: str, number: int = -1):

    digit2zero = False  # set to True to map every digit to '0'
    insts = []
    sentences = []
    sections = []
    predictions = []

    sentence = ''
    prediction = []
    true_label = []

    with open(file, 'r', encoding='utf-8') as f:

        for line in f:
            line = line.strip()
            if line == '':
                if sentence.strip() != '':
                    sentences.append(' '.join(sentence.split(' ')[1:]))
                    sections.append(prediction[0])
                    predictions.append(prediction[1:])
                    inst = Instance(input=Sentence(words=sentence.split()[1:],
                                                   heading=prediction[0]))
                    inst.output = true_label
                    inst.prediction = prediction[1:]
                    insts.append(inst)

                sentence = ''
                prediction = []
                true_label = []
            else:
                if digit2zero:
                    sentence += re.sub(r'\d', '0', line.split()[0]) + ' '
                else:
                    sentence += line.split()[0] + ' '
                prediction.append(line.split()[1])
                if len(line.split()) == 3:
                    true_label.append(line.split()[2])

        if sentence.strip() != '':  # flush the last sentence if the file lacks a trailing blank line
            sentences.append(' '.join(sentence.split(' ')[1:]))
            sections.append(prediction[0])
            predictions.append(prediction[1:])
            inst = Instance(input=Sentence(words=sentence.split()[1:],
                                           heading=prediction[0]))
            inst.output = true_label
            inst.prediction = prediction[1:]
            insts.append(inst)

    return sentences, sections, predictions, insts
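
A note on the expected input (an assumption inferred from the parsing logic above): each sentence block starts with a line whose second column carries the section heading, followed by one whitespace-separated line per token with the word, the predicted label, and an optional gold label; blank lines close a block, and the first token of every block is dropped. A hypothetical file fragment:

    doc1 INTRODUCTION
    The O O
    model B-METHOD I-METHOD

Note that `Instance` and `Sentence` are project-specific classes, and `re` must be imported for the digit2zero branch.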
Example 2
 def distance(self, s1: Sentence, s2: Sentence) -> List[float]:
     return [
         lcs.llcs(
             s1.lowercase_tokens(),
             s2.lowercase_tokens(),
         )
     ]
Example 3
 def distance(self, s1: Sentence, s2: Sentence) -> List[float]:
     return [
         self.wv_levenshtein(
             s1.lowercase_tokens(),
             s2.lowercase_tokens(),
         )
     ]
Example 4
 def distance(self, s1: Sentence, s2: Sentence) -> List[float]:
     return [
         levenshtein(
             s1.lowercase_tokens(),
             s2.lowercase_tokens(),
             self.del_cost,
             self.insert_cost,
             self.subs_cost,
             self.normalize,
         )
     ]
Example 5
 def postprocess(self):
     self.data = list()
     senttokens = list()
     for token in self.output_data:
         if token == "</s>":
             self.data.append(Sentence(senttokens))
             senttokens = list()
         else:
             tok, tag, lemma = token.split("\t")
             senttokens.append(Token(word=tok, xpos=tag, lemma=lemma))
     if senttokens:  # add last sentence
         self.data.append(Sentence(senttokens))
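
This postprocess assumes `self.output_data` is a flat stream of tab-separated `token<TAB>tag<TAB>lemma` entries in which the literal token `</s>` marks a sentence boundary; a minimal illustration with hypothetical data:

    self.output_data = ["The\tDT\tthe", "dog\tNN\tdog", "</s>"]
    # afterwards self.data holds one Sentence containing two Tokens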
Example 6
 def read_txt(self, file: str, number: int = -1) -> List[Instance]:
     print("Reading file: " + file)
     insts = []
     with open(file, 'r', encoding='utf-8') as f:
         words = []
         labels = []
         for line in tqdm(f.readlines()):
             line = line.rstrip()
             if line == "":
                 inst = Instance(Sentence(words), labels)
                 inst.set_id(len(insts))
                 insts.append(inst)
                 words = []
                 labels = []
                 if len(insts) == number:
                     break
                 continue
             word, label = line.split('<|>')
             if self.digit2zero:
                 word = re.sub(r'\d', '0', word)  # replace digit with 0.
             words.append(word)
             self.vocab.add(word)
             labels.append(label)
     print("number of sentences: {}".format(len(insts)))
     return insts
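
A hedged usage sketch for this reader (the reader instance and file path are assumptions): the input holds one `word<|>label` pair per line with a blank line after each sentence, and `number` caps how many sentences are read:

    # hypothetical file contents:
    #   John<|>B-PER
    #   smiled<|>O
    #
    insts = reader.read_txt('data/train.txt', number=100)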
Example 7
 def read_txt(self,
              file: str,
              number: int = -1,
              category: str = "all") -> List[Instance]:
     print("Reading file: " + file)
     insts = []
     with open(file, 'r', encoding='utf-8') as f:
         words = []
         ori_words = []
         labels = []
         for line in tqdm(f.readlines()):
             line = line.rstrip()
             if line == "":
                 if category == "all" or words[0] == category:
                     insts.append(
                         Instance(Sentence(words[1:], ori_words[1:]),
                                  labels[1:]))
                 words = []
                 ori_words = []
                 labels = []
                 if len(insts) == number:
                     break
                 continue
             ls = line.split()
             word, label = ls[0], ls[-1]
             if len(ls) == 1:
                 label = "O"
             ori_words.append(word)
             if self.digit2zero:
                 word = re.sub(r'\d', '0', word)  # replace digit with 0.
             words.append(word)
             self.vocab.add(word)
             labels.append(label)
     print("number of sentences: {}".format(len(insts)))
     return insts
Example 8
def read_extraction_results(file: str,
                            number: int = -1,
                            digit2zero: bool = True) -> List[Instance]:
    print("Reading file: " + file)
    insts = []
    with open(file, 'r', encoding='utf-8') as f:
        words = []
        labels = []
        ground_truth = []
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                inst = Instance(Sentence(words), labels)
                inst.ground_truth = ground_truth
                insts.append(inst)
                words = []
                labels = []
                ground_truth = []
                if len(insts) == number:
                    break
                continue
            _, word, gold_label, predicted_segment_label = line.split()
            if digit2zero:
                word = re.sub(r'\d', '0', word)  # replace digit with 0.
            words.append(word)
            labels.append(predicted_segment_label)
            ground_truth.append(gold_label)
    print("number of sentences: {}".format(len(insts)))
    return insts
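
Judging from the four-way `line.split()`, each non-blank line of the extraction file carries an index, the word, the gold label, and the predicted segment label (a hypothetical line is shown below); the gold label is kept on `inst.ground_truth` while the predicted segment label becomes the instance's label sequence:

    3 Obama B-PER B-SEG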
Example 9
    def read_txt_with_extraction(self, file: str, extraction_file: str, number: int = -1) -> List[Instance]:
        print("Reading file: " + file)
        print("Reading file: " + extraction_file)
        with open(extraction_file, 'r', encoding='utf-8') as f_extract:
            extract_lines = f_extract.readlines()
        i = -1
        insts = []
        with open(file, 'r', encoding='utf-8') as f:
            words = []
            labels = []
            boundaries = []
            for line in tqdm(f.readlines()):
                i += 1
                extract_line = extract_lines[i]
                extract_line = extract_line.rstrip()
                line = line.rstrip()
                if line == "":
                    insts.append(Instance(Sentence(words), labels, boundaries))
                    words = []
                    labels = []
                    boundaries = []
                    if len(insts) == number:
                        break
                    continue
                word, label = line.split()
                _, word_, gold_, predicted_label = extract_line.split()
                if self.digit2zero:
                    word = re.sub(r'\d', '0', word)  # replace digit with 0.
                words.append(word)
                self.vocab.add(word)
                labels.append(label)
                boundaries.append(predicted_label)

        print("number of sentences: {}".format(len(insts)))
        return insts
Example 10
 def read_conll(self, file: str, number: int = -1, is_train: bool = True) -> List[Instance]:
     print("Reading file: " + file)
     insts = []
     num_entity = 0
     with open(file, 'r', encoding='utf-8') as f:
         words = []
         labels = []
         for line in tqdm(f.readlines()):
             line = line.rstrip()
             if line == "":
                 insts.append(Instance(Sentence(words), labels))
                 words = []
                 labels = []
                 if len(insts) == number:
                     break
                 continue
             vals = line.split()
             word = vals[1]
             label = vals[10]
             if self.digit2zero:
                 word = re.sub(r'\d', '0', word)  # replace digit with 0.
             words.append(word)
             self.vocab.add(word)
             labels.append(label)
             if label.startswith("B-"):
                  num_entity += 1
     print("number of sentences: {}, number of entities: {}".format(len(insts), num_entity))
     return insts
Example 11
 def read_txt(self, file: str, number: int = -1) -> List[Instance]:
     insts = []
     with open(file, 'r', encoding='utf-8') as f:
         words = []
         labels = []
         for line in tqdm(f.readlines()):
             line = line.rstrip()
             # a blank line marks the boundary between sentences
             if line == "":
                 if len(words) == 0:
                     continue
                 insts.append(Instance(Sentence(words), labels))
                 words = []
                 labels = []
                 if len(insts) == number:
                     break
                 continue
             word = line.split()[0]
             label = line.split()[1]
             if self.digit2zero:
                 word = re.sub(r'\d', '0', word)  # replace digit with 0.
             words.append(word)
             self.vocab.add(word)
             labels.append(label)
     print("number of sentences: {}".format(len(insts)))
     return insts
Example 12
 def postprocess(self):
     self.data = list()
     for sent in self.output_data:
         mytokens = list()
         for tok in sent.rstrip().split("\n"):
             (
                 index,
                 word,
                 lemma,
                 upos,
                 xpos,
                 feats,
                 head,
                 deprel,
                 deps,
                 misc,
             ) = tok.split("\t")
             mytokens.append(
                 Token(
                     id=index,
                     word=word,
                     lemma=lemma,
                     # don't write out gold pos
                     # upos=upos, xpos=xpos,
                     feats=str(Morph.from_parzu(xpos + "|" + feats)),
                     head=head,
                     deprel=deprel,
                     deps=deps,
                     misc=misc,
                 ))
         self.data.append(Sentence(mytokens))
Example 13
def three_class_data_iter(which, n=None):
    assert which in ["dev", "test", "train"]

    f = base_path / f"snli_1.0_{which}.jsonl"

    with f.open() as i:
        yielded = 0
        for line in i:
            if n is not None and yielded >= n:
                break
            loaded = json.loads(line)
            if loaded["gold_label"] == "-":
                continue
            yield ((Sentence(loaded["sentence1"], loaded["sentence1_parse"]),
                    Sentence(loaded["sentence2"],
                             loaded["sentence2_parse"])), loaded["gold_label"])
            yielded += 1
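
A small usage sketch (assuming `base_path` points at an unpacked SNLI 1.0 directory): the generator yields `((premise, hypothesis), gold_label)` pairs and skips items whose gold label is "-":

    for (premise, hypothesis), gold in three_class_data_iter("dev", n=2):
        print(gold)  # "entailment", "neutral", or "contradiction"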
Example 14
 def postprocess(self):
     self.data = list()
     for sent_doc in self.output_data:
         self.data.append(
             Sentence(
                 Token(word=str(tok),
                       xpos=tok.tag_,
                       upos=tok.pos_,
                       lemma=tok.lemma_) for tok in sent_doc))
Example 15
    def distance(self, s1: Sentence, s2: Sentence) -> List[float]:
        B = nx.Graph()

        s1_tokens = s1.lowercase_tokens()
        s2_tokens = s2.lowercase_tokens()

        top_nodes = [(0, idx) for idx in range(len(s1_tokens))]
        bottom_nodes = [(1, idx) for idx in range(len(s2_tokens))]
        B.add_nodes_from(top_nodes, bipartite=0)
        B.add_nodes_from(bottom_nodes, bipartite=1)

        for idx1, t1 in enumerate(s1_tokens):
            for idx2, t2 in enumerate(s2_tokens):
                # use negative of similarity because we will do minimum weight matching
                B.add_edge((0, idx1), (1, idx2), weight=-self.sim(t1, t2))

        matching = nx.bipartite.matching.minimum_weight_full_matching(B)
        edges = [(v_from, v_to) for v_from, v_to in matching.items()
                 if v_from[0] == 0]
        sum_sim = sum(-B[v_from][v_to]["weight"] for v_from, v_to in edges)
        return [sum_sim / len(s2_tokens)]
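
Design note: `minimum_weight_full_matching` minimizes total edge weight, so the similarities are negated to obtain a maximum-similarity token alignment; the matched similarities are then recovered by negating again and averaged over the length of s2, which makes the score asymmetric in its two arguments.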
Example 16
 def postprocess(self):
     self.data = list()
     for sent in self.output_data:
         senttokens = list()
         for tok in sent.split("\n"):
             token, tag = tok.split("\t")
             stts = rftag2stts(tag)
             senttokens.append(
                 Token(word=token,
                       xpos=stts,
                       feats=str(Morph.from_rftag(tag))))
         self.data.append(Sentence(senttokens))
Example 17
 def postprocess(self):
     self.data = list()
     for sent in self.output_data:
         self.data.append(
             Sentence(
                 Token(
                     id=str(rel.dep().index()),
                     word=rel.dep().word(),
                     # don't write out gold pos
                     #   xpos=rel.dep().tag(),
                     head=str(rel.gov().index()),
                     deprel=str(rel.reln()),
                 ) for rel in sent.typedDependencies()))
Example 18
    def text_to_instances(self, tokens: List[str], annotations: Optional[List[Dict[str, Any]]] = None, **metadata) -> Iterator[Instance]:
        # avoid the mutable-default-argument pitfall
        annotations = annotations if annotations is not None else []
        metadata["og_tokens"] = tokens
        if self.subword_converter is not None:
            tokens, tokidx2bpeidxs = self.subword_converter(tokens)
        else:
            tokidx2bpeidxs = {i: [i] for i in range(len(tokens))}
        metadata["tokidx2bpeidxs"] = tokidx2bpeidxs
        tags = self.get_tags(tokens, annotations, metadata)

        for tokens, tags, metadata in self.as_maximal_subdocs(tokens, tags, metadata):
            inst = Instance(Sentence(tokens), tags)
            inst.metadata = metadata
            yield inst
Example 19
 def postprocess(self):
     self.data = list()
     for sent in self.output_data.sentences:
         self.data.append(
             Sentence(
                 Token(
                     id=tok.index,
                     word=tok.text,
                     lemma=tok.lemma,
                     feats=tok.feats,
                     head=str(tok.governor),
                     deprel=tok.dependency_relation,
                 ) for tok in sent.words))
Example 20
 def postprocess(self):
     self.data = list()
     for sent in self.output_data.sents:
         self.data.append(
             Sentence(
                 Token(
                     word=tok.text,
                     lemma=tok.lemma_,
                     # upos=tok.pos_,
                     #   xpos=tok.tag_,
                     head=str(tok.head.i - sent[0].i + 1),
                     deprel=tok.dep_,
                 ) for tok in sent))
Example 21
 def postprocess(self):
     self.data = list()
     for sent in self.output_data:
         mytokens = list()
         for tok in sent:
             text, rftmorph, stts, lemma = tok
             mytokens.append(
                 Token(
                     word=text,
                     xpos=stts,
                     feats=str(Morph.from_rftag(rftmorph)),
                     lemma=lemma,
                 ))
         self.data.append(Sentence(mytokens))
Example 22
 def postprocess(self):
     self.data = list()
     for sent in self.output_data.rstrip().split("\n\n"):
         mytokens = list()
         for token_entry in sent.split("\n"):
             tok, tag, lemma = token_entry.split("\t")
             maintag = tag.split(".")[0]
             # small fix: splitting on "." strips the dot from the "$." tag, so restore it
             stts = "$." if maintag == "$" else maintag
             mytokens.append(
                 Token(
                     word=tok,
                     xpos=stts,
                     lemma=lemma,
                     feats=str(Morph.from_tigertag(tag)),
                 ))
         self.data.append(Sentence(mytokens))
Example 23
    def read_txt(
            self,
            file: str,
            number: int = -1
    ) -> List[Instance]:  # expected type -> return type
        count_0 = 0
        print("Reading file: " + file)
        insts = []
        with open(file, 'r', encoding='utf-8') as f:
            words = []
            labels = []
            for line in tqdm(f.readlines()):
                line = line.rstrip()
                if line == "":
                    assert len(words) == len(labels)
                    inst = Instance(Sentence(words), labels)
                    inst.set_id(len(insts))
                    insts.append(inst)
                    words = []
                    labels = []
                    if len(insts) == number:
                        break
                    continue

                x = line.split()
                if len(x) == 1:
                    word, label = '&', x[0]  # label-only line: use '&' as a placeholder word
                elif len(x) == 2:
                    word, label = x[0], x[1]
                else:
                    print(x)
                    continue  # skip malformed lines instead of reusing stale word/label

                if self.digit2zero:
                    word = re.sub(
                        r'\d', '0', word
                    )  # replace all digits with 0; r'\d' matches Unicode decimal digits
                    count_0 += 1
                words.append(word)
                self.vocab.add(word)

                labels.append(label)
        print("numbers being replaced by zero:", count_0)
        print("number of sentences: {}".format(len(insts)))
        return insts
Example 24
    def read_txt(self, file: str, number: int = 5) -> List[Instance]:
        print("Reading file: " + file)
        insts = []

        # load the pre-computed sentence vectors that accompany the text file
        vec_path = file[:9] + 'vec_' + file[9:-4] + '.pkl'
        print(vec_path)
        with open(vec_path, 'rb') as f_vec:
            all_vecs = pickle.load(f_vec)

        with open(file, 'r', encoding='utf-8') as f:

            sents = []
            ori_sents = []
            labels = []
            types = []
            sent_idx = 0
            review_idx = []
            reply_idx = []
            labels_pair = []
            max_review_id = 0
            new_index = 0

            lines = f.readlines()
            count_review = 0
            count_reply = 0
            argu_sent_review = 0
            argu_sent_reply = 0
            argu_review = 0
            argu_reply = 0

            for line_idx, line in enumerate(tqdm(lines)):
                line = line.rstrip()
                if line == "":
                    new_index = 0
                    vecs = all_vecs[len(insts)]
                    # max_num_tokens = len(vecs[0])
                    num_tokens = [len(vecs[i]) for i in range(len(vecs))]
                    inst = Instance(Sentence(sents, ori_sents), labels, vecs,
                                    types, review_idx, reply_idx, labels_pair,
                                    max_review_id, num_tokens)
                    insts.append(inst)
                    sents = []
                    ori_sents = []
                    labels = []
                    types = []
                    sent_idx = 0
                    review_idx = []
                    reply_idx = []
                    labels_pair = []
                    max_review_id = 0
                    if len(insts) == number:
                        break
                    continue
                ls = line.split('\t')
                if ls[1] == 'O':
                    sent, label, label_pair, sent_type = ls[0], ls[1], 0, ls[-2]
                else:
                    sent, label, label_pair, sent_type = (
                        ls[0], ls[1][:2] + '0', int(ls[2][2:]), ls[-2])

                ori_sents.append(sent)
                if sent_type == 'Review':
                    count_review += 1
                    type_id = 0
                    if label[0] != 'O':
                        review_idx.append(sent_idx)
                        argu_sent_review += 1
                    if label[0] == 'B':
                        argu_review += 1
                    max_review_id += 1
                else:
                    type_id = 1
                    count_reply += 1
                    reply_idx.append(sent_idx)
                    if label[0] != 'O':
                        argu_sent_reply += 1
                    if label[0] == 'B':
                        argu_reply += 1

                types.append(type_id)

                sent_idx += 1
                new_index += 1

                # if self.digit2zero:
                #     sent = re.sub('\d', '0', sent) # replace digit with 0.
                sents.append(sent)
                self.vocab.add(sent)

                labels.append(label)
                labels_pair.append(label_pair)
        print(
            'review, reply, review_argu, reply_argu, review_sent_argu, reply_sent_argu',
            count_review, count_reply, argu_review, argu_reply,
            argu_sent_review, argu_sent_reply)
        print("number of sentences: {}".format(len(insts)))
        all_vecs = 0  # release the loaded vectors before returning
        vecs = 0
        return insts
Example 25
 def distance(self, s1: Sentence, s2: Sentence) -> List[float]:
     return [simple_distance(s1.tree(), s2.tree())]
Example 26
 def postprocess(self):
     """re-format output_data so that it conforms to eval format"""
     self.data = list()
     for sent in self.output_data:
         self.data.append(
             Sentence(Token(word=tok, xpos=tag) for tok, tag in sent))
Example 27
 def postprocess(self):
     self.data = list()
     for sent in self.output_data.sentence:
         self.data.append(
             Sentence(
                 Token(word=tok.word, xpos=tok.pos) for tok in sent.token))
Example 28
 def sents_to_insts(self, sentences: List[str]) -> List[Instance]:
     insts = []
     for sentence in sentences:
         words = sentence.split()
         insts.append(Instance(Sentence(words)))
     return insts
Example 29
 def sent_to_insts(self, sentence: str) -> List[Instance]:
     words = sentence.split()
     return [Instance(Sentence(words))]
Example 30
    def read_trigger_txt(self, file: str, number: int = -1) -> Tuple[List[Instance], int, int]:
        label_vocab = dict()
        print("Reading file: " + file)
        insts = []
        max_length = 0
        with open(file, 'r', encoding='utf-8') as f:
            words = []
            labels = []
            word_index = 0
            for line in tqdm(f.readlines()):
                line = line.rstrip()
                if line == "":
                    # check the sequence of index to find entity which consists of multiple words
                    entity_dict = dict()
                    for ent in labels:
                        if ent[0].startswith("B-") or ent[0].startswith(
                                "I-") or ent[0].startswith("T-"):
                            if ent[0].split("-")[1] not in entity_dict:
                                entity_dict[ent[0].split("-")[1]] = [[
                                    words[ent[1]], ent[1]
                                ]]
                            else:
                                entity_dict[ent[0].split("-")[1]].append(
                                    [words[ent[1]], ent[1]])

                    # entity word, index, type
                    trigger_positions = []
                    trigger_keys = []
                    trigger_label = None  # stays None if the block defines no entity-type key
                    for key in entity_dict:
                        if key in [
                                '0', '1', '2', '3', '4', '5', '6', '7', '8',
                                '9'
                        ]:
                            trigger_positions.append(
                                [i[1] for i in entity_dict[key]])
                            trigger_keys.append(" ".join(
                                i[0] for i in entity_dict[key]))
                        else:
                            if key not in label_vocab:
                                label_vocab[key] = len(label_vocab)
                            trigger_label = label_vocab[key]

                    final_labels = []
                    for label in labels:
                        if label[0].startswith("T"):
                            final_labels.append("O")
                        else:
                            final_labels.append(label[0])

                    for trigger_position, trigger_key in zip(
                            trigger_positions, trigger_keys):
                        insts.append(
                            Instance(Sentence(words), final_labels, None,
                                     trigger_label, trigger_position,
                                     trigger_key))

                    #insts.append(Instance(Sentence(words), labels, None, trigger_label, trigger_positions))
                    if len(words) > max_length:
                        max_length = len(words)
                    words = []
                    labels = []
                    word_index = 0
                    if len(insts) == number:
                        break
                    continue
                word, label = line.split()

                if self.digit2zero:
                    word = re.sub(r'\d', '0', word)  # replace digit with 0.
                words.append(word)
                self.vocab.add(word)
                labels.append([label, word_index])
                word_index += 1
        print("number of sentences: {}".format(len(insts)))
        return insts, max_length, len(label_vocab)
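
From the parsing logic, each block appears to mix ordinary BIO entity tags with trigger tags whose suffix is a single digit naming a trigger group (e.g. T-0); a hypothetical block:

    He      O
    was     O
    born    T-0
    in      T-0
    Chicago B-LOC

Trigger tokens are re-labeled "O" in final_labels, and one Instance is emitted per trigger group, all sharing the same sentence and entity labels.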