Esempio n. 1
0
 def read_txt(self, file: str, number: int = -1) -> List[Instance]:
     """Read a one-token-per-line corpus whose fields are separated by ``<|>``.

     Every non-blank line must have the form ``word<|>label``; a blank line
     terminates the current sentence. Each word (after the optional digit
     normalisation controlled by ``self.digit2zero``) is added to
     ``self.vocab``.

     Args:
         file: Path of the UTF-8 encoded text file.
         number: Stop as soon as this many sentences were collected. The
             default ``-1`` never matches, so the whole file is read.

     Returns:
         The sentences as ``Instance`` objects; each receives its position
         in the returned list as id.

     Raises:
         ValueError: If a non-blank line does not split into exactly two
             fields on ``<|>``.
     """
     print("Reading file: " + file)
     insts = []
     words = []
     labels = []
     with open(file, 'r', encoding='utf-8') as f:
         for line in tqdm(f.readlines()):
             line = line.rstrip()
             if line == "":
                 inst = Instance(Sentence(words), labels)
                 inst.set_id(len(insts))
                 insts.append(inst)
                 words = []
                 labels = []
                 if len(insts) == number:
                     break
                 continue
             word, label = line.split('<|>')
             if self.digit2zero:
                 word = re.sub(r'\d', '0', word)  # replace digit with 0.
             words.append(word)
             self.vocab.add(word)
             labels.append(label)
     # A file that does not end with a blank line would otherwise lose its
     # last sentence. After an early ``break`` the buffers are empty, so the
     # sentence limit is still respected.
     if words:
         inst = Instance(Sentence(words), labels)
         inst.set_id(len(insts))
         insts.append(inst)
     print("number of sentences: {}".format(len(insts)))
     return insts
def read_extraction_results(file: str,
                            number: int = -1,
                            digit2zero: bool = True) -> List[Instance]:
    """Read a four-column extraction result file into instances.

    Every non-blank line must contain exactly four whitespace-separated
    fields; the first is ignored, the remaining three are the word, the gold
    label and the predicted segment label. A blank line terminates the
    current sentence.

    Args:
        file: Path of the UTF-8 encoded result file.
        number: Stop as soon as this many sentences were collected. The
            default ``-1`` never matches, so the whole file is read.
        digit2zero: When true, every digit in a word is replaced by ``0``.

    Returns:
        One ``Instance`` per sentence, built from the words and the
        predicted segment labels, with the gold labels stored on its
        ``ground_truth`` attribute.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields.
    """
    print("Reading file: " + file)
    insts = []
    words = []
    labels = []
    ground_truth = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                inst = Instance(Sentence(words), labels)
                inst.ground_truth = ground_truth
                insts.append(inst)
                words = []
                labels = []
                ground_truth = []
                if len(insts) == number:
                    break
                continue
            _, word, gold_label, predicted_segment_label = line.split()
            if digit2zero:
                word = re.sub(r'\d', '0', word)  # replace digit with 0.
            words.append(word)
            labels.append(predicted_segment_label)
            ground_truth.append(gold_label)
    # Keep the final sentence of a file that lacks a terminating blank line.
    # The buffers are empty after an early ``break``, so the limit holds.
    if words:
        inst = Instance(Sentence(words), labels)
        inst.ground_truth = ground_truth
        insts.append(inst)
    print("number of sentences: {}".format(len(insts)))
    return insts
    def text_to_instances(self,
                          tokens: List[str],
                          annotations: List[Dict[str, Any]] = None,
                          **metadata) -> Instance:
        """Yield one ``Instance`` per sub-document of a tokenised text.

        NOTE: this is a generator; the ``Instance`` return annotation
        describes the yielded elements, not the returned object.

        The original tokens are stored in ``metadata["og_tokens"]``. When
        ``self.subword_converter`` is set it replaces the tokens and supplies
        the token-index to sub-word-indices mapping; otherwise an identity
        mapping is used. The mapping is stored in
        ``metadata["tokidx2bpeidxs"]``. Tags come from ``self.get_tags`` and
        the result is split by ``self.as_maximal_subdocs``.

        Args:
            tokens: The tokens of the text.
            annotations: Annotation dictionaries forwarded to
                ``self.get_tags``. ``None`` (the default) is treated as an
                empty list; a fresh list is created per call so no state is
                shared between calls.
            **metadata: Extra values forwarded to ``self.get_tags`` and
                ``self.as_maximal_subdocs``.

        Yields:
            An ``Instance`` per (tokens, tags, metadata) triple produced by
            ``self.as_maximal_subdocs``, with that metadata attached as
            ``inst.metadata``.
        """
        if annotations is None:
            annotations = []
        metadata["og_tokens"] = tokens
        if self.subword_converter is not None:
            tokens, tokidx2bpeidxs = self.subword_converter(tokens)
        else:
            tokidx2bpeidxs = {i: [i] for i in range(len(tokens))}
        metadata["tokidx2bpeidxs"] = tokidx2bpeidxs
        tags = self.get_tags(tokens, annotations, metadata)

        # Distinct loop names keep the full-document values readable instead
        # of shadowing them with the per-sub-document ones.
        for sub_tokens, sub_tags, sub_metadata in self.as_maximal_subdocs(
                tokens, tags, metadata):
            inst = Instance(Sentence(sub_tokens), sub_tags)
            inst.metadata = sub_metadata
            yield inst
def read_txt(file: str, number: int = -1, digit2zero: bool = False):
    """Read a prediction file whose sentences start with a heading row.

    Each non-blank line holds whitespace-separated fields: a token, a
    prediction and optionally a third field that is collected as true label
    (only when the line has exactly three fields). Lines that are blank after
    stripping separate sentences. The first row of a sentence is special: its
    token is dropped and its prediction is used as the sentence heading.

    Args:
        file: Path of the UTF-8 encoded file.
        number: Accepted for interface compatibility; it is not used and no
            sentence limit is applied.
        digit2zero: When true, every digit in a token is replaced by ``0``.
            Defaults to ``False``, the value that used to be hard-coded.

    Returns:
        A 4-tuple ``(sentences, sections, predictions, insts)``:
        the sentence texts (heading token removed), the heading predictions,
        the remaining predictions per sentence, and the ``Instance`` objects.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    sentences = []
    sections = []
    predictions = []
    insts = []

    def emit(tokens, prediction, true_label):
        # Turn one buffered sentence into its four outputs.
        if not tokens:
            return
        heading = prediction[0]
        # The trailing empty element reproduces the trailing space that the
        # established output format carries after the last token.
        sentences.append(' '.join(tokens[1:] + ['']))
        sections.append(heading)
        predictions.append(prediction[1:])
        inst = Instance(input=Sentence(words=tokens[1:], heading=heading))
        inst.output = true_label
        inst.prediction = prediction[1:]
        insts.append(inst)

    tokens = []
    prediction = []
    true_label = []

    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                emit(tokens, prediction, true_label)
                tokens = []
                prediction = []
                true_label = []
                continue
            token = fields[0]
            if digit2zero:
                token = re.sub(r'\d', '0', token)
            tokens.append(token)
            prediction.append(fields[1])
            if len(fields) == 3:
                true_label.append(fields[2])

        # The last sentence may not be followed by a blank line.
        emit(tokens, prediction, true_label)

    return sentences, sections, predictions, insts
Esempio n. 5
0
 def read_txt(self,
              file: str,
              number: int = -1,
              category: str = "all") -> List[Instance]:
     """Read a token-per-line corpus, optionally keeping a single category.

     On every non-blank line the first whitespace-separated field is the
     word and the last one the label; a line with a single field gets the
     label ``"O"``. A blank line terminates the current sentence. The first
     row of a sentence is treated as its category: it is compared against
     ``category`` and excluded from the produced instance. Every word
     (including that first row) is added to ``self.vocab`` after the
     optional digit normalisation.

     Args:
         file: Path of the UTF-8 encoded text file.
         number: Stop as soon as this many sentences were collected. The
             default ``-1`` never matches, so the whole file is read.
         category: Keep only sentences whose first word equals this value;
             ``"all"`` keeps every sentence.

     Returns:
         The kept sentences as ``Instance`` objects holding the normalised
         and the original words plus the labels.
     """
     print("Reading file: " + file)
     insts = []
     words = []
     ori_words = []
     labels = []
     with open(file, 'r', encoding='utf-8') as f:
         for line in tqdm(f.readlines()):
             line = line.rstrip()
             if line == "":
                 # ``words`` is empty for consecutive blank lines; checking
                 # it first avoids an IndexError on ``words[0]``.
                 if category == "all" or (words and words[0] == category):
                     insts.append(
                         Instance(Sentence(words[1:], ori_words[1:]),
                                  labels[1:]))
                 words = []
                 ori_words = []
                 labels = []
                 if len(insts) == number:
                     break
                 continue
             ls = line.split()
             word = ls[0]
             label = "O" if len(ls) == 1 else ls[-1]
             ori_words.append(word)
             if self.digit2zero:
                 word = re.sub(r'\d', '0', word)  # replace digit with 0.
             words.append(word)
             self.vocab.add(word)
             labels.append(label)
     # Keep the final sentence of a file that lacks a terminating blank line.
     # The buffers are empty after an early ``break``, so the limit holds.
     if words and (category == "all" or words[0] == category):
         insts.append(
             Instance(Sentence(words[1:], ori_words[1:]), labels[1:]))
     print("number of sentences: {}".format(len(insts)))
     return insts
Esempio n. 6
0
    def read_txt_with_extraction(self, file: str, extraction_file: str, number: int = -1) -> List[Instance]:
        """Read a corpus together with a line-aligned extraction file.

        ``file`` holds ``word label`` pairs, one per line, with blank lines
        between sentences. ``extraction_file`` is read line by line in
        lockstep; on token lines it must contain four whitespace-separated
        fields, of which only the last (the predicted label) is used as the
        token's boundary. Every word is added to ``self.vocab`` after the
        optional digit normalisation.

        Args:
            file: Path of the UTF-8 encoded corpus file.
            extraction_file: Path of the UTF-8 encoded extraction file.
            number: Stop as soon as this many sentences were collected. The
                default ``-1`` never matches, so the whole file is read.

        Returns:
            One ``Instance`` per sentence, built from words, labels and
            boundaries.

        Raises:
            IndexError: If ``extraction_file`` has fewer lines than ``file``.
            ValueError: If a token line of either file has an unexpected
                number of fields.
        """
        print("Reading file: " + file)
        print("Reading file: " + extraction_file)
        # Read the extraction file inside a context manager so its handle is
        # closed instead of being leaked.
        with open(extraction_file, 'r', encoding='utf-8') as f_extract:
            extract_lines = f_extract.readlines()
        insts = []
        words = []
        labels = []
        boundaries = []
        with open(file, 'r', encoding='utf-8') as f:
            for i, line in enumerate(tqdm(f.readlines())):
                extract_line = extract_lines[i].rstrip()
                line = line.rstrip()
                if line == "":
                    insts.append(Instance(Sentence(words), labels, boundaries))
                    words = []
                    labels = []
                    boundaries = []
                    if len(insts) == number:
                        break
                    continue
                word, label = line.split()
                # Only the predicted label is needed, but the unpacking still
                # enforces the four-column layout.
                _, _, _, predicted_label = extract_line.split()
                if self.digit2zero:
                    word = re.sub(r'\d', '0', word)  # replace digit with 0.
                words.append(word)
                self.vocab.add(word)
                labels.append(label)
                boundaries.append(predicted_label)
        # Keep the final sentence of a file that lacks a terminating blank
        # line. The buffers are empty after an early ``break``.
        if words:
            insts.append(Instance(Sentence(words), labels, boundaries))

        print("number of sentences: {}".format(len(insts)))
        return insts
Esempio n. 7
0
 def read_conll(self, file: str, number: int = -1, is_train: bool = True) -> List[Instance]:
     """Read a whitespace-separated multi-column file into instances.

     On every non-blank line the field at index 1 is the word and the field
     at index 10 is the label; a blank line terminates the current sentence.
     Every word is added to ``self.vocab`` after the optional digit
     normalisation, and labels starting with ``"B-"`` are counted as
     entities for the summary that is printed at the end.

     Args:
         file: Path of the UTF-8 encoded file.
         number: Stop as soon as this many sentences were collected. The
             default ``-1`` never matches, so the whole file is read.
         is_train: Accepted for interface compatibility; not used here.

     Returns:
         One ``Instance`` per sentence.

     Raises:
         IndexError: If a non-blank line has fewer than eleven fields.
     """
     print("Reading file: " + file)
     insts = []
     num_entity = 0
     words = []
     labels = []
     with open(file, 'r', encoding='utf-8') as f:
         for line in tqdm(f.readlines()):
             line = line.rstrip()
             if line == "":
                 insts.append(Instance(Sentence(words), labels))
                 words = []
                 labels = []
                 if len(insts) == number:
                     break
                 continue
             vals = line.split()
             word = vals[1]
             label = vals[10]
             if self.digit2zero:
                 word = re.sub(r'\d', '0', word)  # replace digit with 0.
             words.append(word)
             self.vocab.add(word)
             labels.append(label)
             if label.startswith("B-"):
                 num_entity += 1
     # Keep the final sentence of a file that lacks a terminating blank line.
     # The buffers are empty after an early ``break``, so the limit holds.
     if words:
         insts.append(Instance(Sentence(words), labels))
     print("number of sentences: {}, number of entities: {}".format(len(insts), num_entity))
     return insts
Esempio n. 8
0
 def read_txt(self, file: str, number: int = -1) -> List[Instance]:
     """Read a ``word label`` per-line corpus, skipping empty sentences.

     On every non-blank line the first two whitespace-separated fields are
     the word and its label. A blank line marks a sentence boundary; runs of
     blank lines do not produce empty sentences. Every word is added to
     ``self.vocab`` after the optional digit normalisation.

     Args:
         file: Path of the UTF-8 encoded text file.
         number: Stop as soon as this many sentences were collected. The
             default ``-1`` never matches, so the whole file is read.

     Returns:
         One ``Instance`` per non-empty sentence.

     Raises:
         IndexError: If a non-blank line has fewer than two fields.
     """
     insts = []
     words = []
     labels = []
     with open(file, 'r', encoding='utf-8') as f:
         for line in tqdm(f.readlines()):
             line = line.rstrip()
             # A blank line is the separator between sentences.
             if line == "":
                 if not words:
                     continue
                 insts.append(Instance(Sentence(words), labels))
                 words = []
                 labels = []
                 if len(insts) == number:
                     break
                 continue
             fields = line.split()
             word = fields[0]
             label = fields[1]
             if self.digit2zero:
                 word = re.sub(r'\d', '0', word)  # replace digit with 0.
             words.append(word)
             self.vocab.add(word)
             labels.append(label)
     # Keep the final sentence of a file that lacks a terminating blank line.
     # The buffers are empty after an early ``break``, so the limit holds.
     if words:
         insts.append(Instance(Sentence(words), labels))
     print("number of sentences: {}".format(len(insts)))
     return insts
Esempio n. 9
0
    def read_txt(
            self,
            file: str,
            number: int = -1
    ) -> List[Instance]:
        """Read a ``word label`` per-line corpus, tolerating missing words.

        A non-blank line with two whitespace-separated fields is a word and
        its label. A line with a single field is taken as a label whose word
        is the placeholder ``'&'``. Lines with any other number of fields are
        printed and skipped. A blank line terminates the current sentence.
        Every word is added to ``self.vocab`` after the optional digit
        normalisation, and the number of digits replaced is printed at the
        end.

        Args:
            file: Path of the UTF-8 encoded text file.
            number: Stop as soon as this many sentences were collected. The
                default ``-1`` never matches, so the whole file is read.

        Returns:
            The sentences as ``Instance`` objects; each receives its
            position in the returned list as id.
        """
        digits_replaced = 0
        print("Reading file: " + file)
        insts = []
        words = []
        labels = []
        with open(file, 'r', encoding='utf-8') as f:
            for line in tqdm(f.readlines()):
                line = line.rstrip()
                if line == "":
                    inst = Instance(Sentence(words), labels)
                    inst.set_id(len(insts))
                    insts.append(inst)
                    words = []
                    labels = []
                    if len(insts) == number:
                        break
                    continue

                x = line.split()
                if len(x) == 1:
                    word, label = '&', x[0]  # label without a word
                elif len(x) == 2:
                    word, label = x
                else:
                    # Report the malformed line and skip it; falling through
                    # would re-append the previous token's word and label
                    # (or fail on the very first line).
                    print(x)
                    continue

                if self.digit2zero:
                    # '\d' matches Unicode decimal digits; subn also reports
                    # how many digits were actually replaced.
                    word, replaced = re.subn(r'\d', '0', word)
                    digits_replaced += replaced
                words.append(word)
                self.vocab.add(word)
                labels.append(label)
        # Keep the final sentence of a file that lacks a terminating blank
        # line. The buffers are empty after an early ``break``.
        if words:
            inst = Instance(Sentence(words), labels)
            inst.set_id(len(insts))
            insts.append(inst)
        print("numbers being replaced by zero:", digits_replaced)
        print("number of sentences: {}".format(len(insts)))
        return insts
def generate_instances(
        prefix: str, seed: int, instances_per_size: int, job_counts: List[int],
        machine_counts: List[int],
        duration_distributions: List[str]) -> Generator[Instance, None, None]:
    """Lazily generate named instances for every parameter combination.

    Combinations are visited with the duration distribution varying slowest,
    then the job count, then the machine count; ``instances_per_size``
    instances are produced per combination. Instances are numbered from 1,
    and the number is zero-padded in the name and added to ``seed`` to seed
    each individual instance.

    Args:
        prefix: Text placed at the start of every instance name.
        seed: Base seed; instance number ``k`` is generated with
            ``seed + k``.
        instances_per_size: Instances to generate per combination.
        job_counts: Job counts to cover.
        machine_counts: Machine counts to cover.
        duration_distributions: Keys into ``DISTRIBUTIONS``.

    Yields:
        An ``Instance`` carrying its name, job count, machine count and the
        data returned by ``generate_instance``.
    """
    total = (instances_per_size * len(job_counts) * len(machine_counts)
             * len(duration_distributions))
    width = len(str(total + 1))
    combinations = (
        (distribution, jobs, machines)
        for distribution in duration_distributions
        for jobs in job_counts
        for machines in machine_counts
        for _ in range(instances_per_size)
    )
    for num, (distribution, jobs, machines) in enumerate(combinations,
                                                         start=1):
        name = f'{prefix}{num:0{width}d}_s{seed}_j{jobs}_m{machines}_d{distribution}'
        data = generate_instance(
            seed=seed + num,
            n_jobs=jobs,
            n_machines=machines,
            duration_distribution=DISTRIBUTIONS[distribution])
        yield Instance(name=name,
                       n_jobs=jobs,
                       n_machines=machines,
                       data=data)
Esempio n. 11
0
 def sents_to_insts(self, sentences: List[str]) -> List[Instance]:
     """Wrap each sentence, split on whitespace, in an unlabeled ``Instance``.

     Args:
         sentences: Raw sentence strings.

     Returns:
         One ``Instance`` per input sentence, in the same order.
     """
     return [Instance(Sentence(text.split())) for text in sentences]
Esempio n. 12
0
 def sent_to_insts(self, sentence: str) -> List[Instance]:
     """Return a one-element list holding the sentence as an ``Instance``.

     Args:
         sentence: Raw sentence string; it is split on whitespace.

     Returns:
         A list with a single unlabeled ``Instance``.
     """
     tokens = sentence.split()
     only_instance = Instance(Sentence(tokens))
     return [only_instance]
Esempio n. 13
0
    def read_trigger_txt(self, file: str, number: int = -1) -> List[Instance]:
        """Read a trigger-annotated ``word label`` corpus.

        NOTE: despite the return annotation, a 3-tuple is returned (see
        Returns).

        Labels starting with ``B-``, ``I-`` or ``T-`` are grouped by the text
        between their first and second hyphen. Groups whose key is a single
        digit ``0``-``9`` are triggers; every other key is an entity type
        that receives a running integer id. One ``Instance`` is produced per
        trigger group, in which every label starting with ``T`` is replaced
        by ``"O"``. Every word is added to ``self.vocab`` after the optional
        digit normalisation. A blank line terminates the current sentence.

        Args:
            file: Path of the UTF-8 encoded text file.
            number: Stop once at least this many instances were collected.
                Since a sentence can contribute several instances, the result
                may hold slightly more than ``number``. Non-positive values
                (default ``-1``) read the whole file.

        Returns:
            ``(insts, max_length, n_types)``: the instances, the length of
            the longest sentence seen, and the number of distinct entity
            types.

        Raises:
            ValueError: If a non-blank line does not have exactly two fields.
        """
        trigger_ids = set("0123456789")
        label_vocab = dict()
        print("Reading file: " + file)
        insts = []
        max_length = 0
        with open(file, 'r', encoding='utf-8') as f:
            words = []
            labels = []
            for line in tqdm(f.readlines()):
                line = line.rstrip()
                if line != "":
                    word, label = line.split()
                    if self.digit2zero:
                        word = re.sub(r'\d', '0', word)  # replace digit with 0.
                    words.append(word)
                    self.vocab.add(word)
                    labels.append(label)
                    continue

                # Group (word, position) pairs by label key so that
                # multi-word entities and triggers end up together.
                groups = dict()
                for position, tag in enumerate(labels):
                    if tag.startswith(("B-", "I-", "T-")):
                        key = tag.split("-")[1]
                        groups.setdefault(key, []).append(
                            (words[position], position))

                trigger_positions = []
                trigger_keys = []
                # Reset per sentence so that a sentence without any entity
                # type neither fails with a NameError nor silently inherits
                # the previous sentence's label.
                trigger_label = None
                for key, members in groups.items():
                    if key in trigger_ids:
                        trigger_positions.append(
                            [position for _, position in members])
                        trigger_keys.append(
                            " ".join(member for member, _ in members))
                    else:
                        if key not in label_vocab:
                            label_vocab[key] = len(label_vocab)
                        trigger_label = label_vocab[key]

                # Trigger tags are not part of the tagging target.
                final_labels = [
                    "O" if tag.startswith("T") else tag for tag in labels
                ]

                for trigger_position, trigger_key in zip(
                        trigger_positions, trigger_keys):
                    insts.append(
                        Instance(Sentence(words), final_labels, None,
                                 trigger_label, trigger_position,
                                 trigger_key))

                max_length = max(max_length, len(words))
                words = []
                labels = []
                # ``>=`` because several instances may be added at once; an
                # equality test could step over the limit and never stop.
                if number > 0 and len(insts) >= number:
                    break
        print("number of sentences: {}".format(len(insts)))
        return insts, max_length, len(label_vocab)
 def add_instance(self, name, libRef, cellRef, viewRef):
     """Create an ``Instance`` from the three references and register it.

     The new instance is stored in ``self.instances`` under ``name``.
     """
     created = Instance(libRef, cellRef, viewRef)
     self.instances.update({name: created})
    def read_txt(self, file: str, number: int = 5) -> List[Instance]:
        """Read a tab-separated review/reply corpus with pickled vectors.

        The vectors are loaded from a pickle file whose path is derived from
        ``file`` by inserting ``vec_`` after its first nine characters and
        replacing the last four characters with ``.pkl``. Entry ``k`` of the
        pickle belongs to the ``k``-th passage of ``file``.

        Every non-blank line is one sentence with tab-separated fields: the
        text, the label, the pair label (read only for non-``O`` labels) and,
        second to last, the type. A non-``O`` label keeps its first two
        characters followed by ``0``; its pair id is the integer after the
        first two characters of the third field. A blank line terminates the
        current passage. Every sentence text is added to ``self.vocab``.

        NOTE: ``pickle.load`` can execute arbitrary code from a crafted
        file; only use this with vector files from a trusted source.

        Args:
            file: Path of the UTF-8 encoded corpus file.
            number: Stop as soon as this many passages were collected
                (default ``5``).

        Returns:
            One ``Instance`` per blank-line-terminated passage.
        """
        print("Reading file: " + file)
        insts = []

        vec_path = file[:9] + 'vec_' + file[9:-4] + '.pkl'
        # The context manager closes the handle (the earlier bare
        # ``f_vec.close`` never called the method), and the path printed is
        # the one that is actually opened.
        with open(vec_path, 'rb') as f_vec:
            print(vec_path)
            all_vecs = pickle.load(f_vec)

        with open(file, 'r', encoding='utf-8') as f:
            sents = []
            ori_sents = []
            labels = []
            types = []
            sent_idx = 0
            review_idx = []
            reply_idx = []
            labels_pair = []
            max_review_id = 0

            count_review = 0
            count_reply = 0
            argu_sent_review = 0
            argu_sent_reply = 0
            argu_review = 0
            argu_reply = 0

            for line in tqdm(f.readlines()):
                line = line.rstrip()
                if line == "":
                    # Vectors are looked up by the index of the passage that
                    # is about to be stored.
                    vecs = all_vecs[len(insts)]
                    num_tokens = [len(vec) for vec in vecs]
                    inst = Instance(Sentence(sents, ori_sents), labels, vecs,
                                    types, review_idx, reply_idx, labels_pair,
                                    max_review_id, num_tokens)
                    insts.append(inst)
                    sents = []
                    ori_sents = []
                    labels = []
                    types = []
                    sent_idx = 0
                    review_idx = []
                    reply_idx = []
                    labels_pair = []
                    max_review_id = 0
                    if len(insts) == number:
                        break
                    continue

                ls = line.split('\t')
                sent = ls[0]
                sent_type = ls[-2]
                if ls[1] == 'O':
                    label = ls[1]
                    label_pair = 0
                else:
                    label = ls[1][:2] + '0'
                    label_pair = int(ls[2][2:])

                ori_sents.append(sent)
                if sent_type == 'Review':
                    count_review += 1
                    type_id = 0
                    # Only argument sentences of a review are indexed ...
                    if label[0] != 'O':
                        review_idx.append(sent_idx)
                        argu_sent_review += 1
                    if label[0] == 'B':
                        argu_review += 1
                    max_review_id += 1
                else:
                    type_id = 1
                    count_reply += 1
                    # ... whereas every reply sentence is indexed.
                    reply_idx.append(sent_idx)
                    if label[0] != 'O':
                        argu_sent_reply += 1
                    if label[0] == 'B':
                        argu_reply += 1

                types.append(type_id)
                sent_idx += 1

                sents.append(sent)
                self.vocab.add(sent)

                labels.append(label)
                labels_pair.append(label_pair)
        print(
            'review, reply, review_argu, reply_argu, review_sent_argu, reply_sent_argu',
            count_review, count_reply, argu_review, argu_reply,
            argu_sent_review, argu_sent_reply)
        print("number of sentences: {}".format(len(insts)))
        return insts