Example #1
    def __extra_examples_from_org_file__(self, org_file, des_filename):
        if file_tool.check_file(des_filename):
            examples_dicts = file_tool.load_data_pickle(des_filename)
            return examples_dicts
        example_dicts = []
        rows = file_tool.read_tsv(org_file)
        for i, row in enumerate(rows):
            if i == 0:
                # skip the header row
                continue

            if len(row) != 6:
                raise RuntimeError

            example_temp = {
                'qes_id1': int(row[1]),
                'qes1': str(row[3]).strip(),
                'qes_id2': int(row[2]),
                'qes2': str(row[4]).strip(),
                'label': int(row[5]),
                'id': int(row[0]),
            }
            example_dicts.append(example_temp)

        file_tool.save_data_pickle(example_dicts, des_filename)
        return example_dicts
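
All of these examples rely on project-internal helpers (file_tool, general_tool, parser_tool) whose sources are not shown on this page. A minimal sketch of what the file_tool calls used above might look like, assuming plain pickle and csv from the standard library; only the call signatures mirror the examples, the bodies are assumptions:

import csv
import os
import pickle


def check_file(path):
    # assumed helper: report whether a cached file already exists
    return os.path.isfile(path)


def load_data_pickle(path):
    # assumed helper: load a previously cached object
    with open(path, 'rb') as f:
        return pickle.load(f)


def save_data_pickle(data, path):
    # assumed helper: cache an object for the next run
    with open(path, 'wb') as f:
        pickle.dump(data, f)


def read_tsv(path):
    # assumed helper: return all rows of a tab-separated file
    with open(path, 'r', encoding='utf-8') as f:
        return list(csv.reader(f, delimiter='\t'))

With helpers of this shape, every __extra_*_from_org_file__ method follows the same pattern: return the cached pickle if it exists, otherwise parse the original TSV once and cache the result.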
Example #2
    def __extra_sentences_from_org_file__(self, org_file, des_filename):
        if file_tool.check_file(des_filename):
            sentence_dict = file_tool.load_data_pickle(des_filename)
            return sentence_dict
        sentence_dict = {}
        rows = file_tool.load_data(org_file, mode='r')
        for i, row in enumerate(rows):
            result = row.split("\t")
            if i == 0:
                continue
            if len(result) != 6:
                raise RuntimeError

            if not general_tool.is_number(result[1]):
                raise RuntimeError

            if not general_tool.is_number(result[2]):
                raise RuntimeError

            if str(result[3]).strip() == '':
                print('empty sentence id:{}'.format(str(result[1]).strip()))
            else:
                sentence_dict[str(result[1]).strip()] = str(result[3]).strip()

            if str(result[4]).strip() == '':
                print('empty sentence id:{}'.format(str(result[2]).strip()))
                continue
            else:
                sentence_dict[str(result[2]).strip()] = str(result[4]).strip()

        file_tool.save_data_pickle(sentence_dict, des_filename)
        return sentence_dict
Example #3
def show_sent_len_distribute():
    sent_len_table = file_tool.load_data_pickle(
        file_tool.connect_path(Qqp.data_path, 'sent_len_table.pkl'))
    plt.bar(range(1, len(sent_len_table) + 1), sent_len_table)
    plt.title("")
    plt.xlabel('sentence length')
    plt.ylabel('count')
    plt.xlim(0, 80)
    plt.show()
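
Example #3 assumes matplotlib (import matplotlib.pyplot as plt) and a precomputed sent_len_table pickle in which entry i - 1 holds the number of sentences with i tokens. A self-contained sketch, with made-up token counts, that builds such a table and plots it the same way:

import matplotlib.pyplot as plt

# made-up token counts per sentence, standing in for the real corpus
sent_lens = [7, 12, 12, 3, 25, 12, 7]

# sent_len_table[i - 1] = number of sentences that are i tokens long
sent_len_table = [0] * max(sent_lens)
for length in sent_lens:
    sent_len_table[length - 1] += 1

plt.bar(range(1, len(sent_len_table) + 1), sent_len_table)
plt.xlabel('sentence length')
plt.ylabel('count')
plt.show()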
Example #4
def get_qqp_obj(force=False):

    global single_qqp_obj
    if force or (single_qqp_obj is None):
        single_qqp_obj_file = file_tool.connect_path("corpus/qqp",
                                                     'qqp_obj.pkl')
        if file_tool.check_file(single_qqp_obj_file):
            single_qqp_obj = file_tool.load_data_pickle(single_qqp_obj_file)
        else:
            single_qqp_obj = Qqp()
            file_tool.save_data_pickle(single_qqp_obj, single_qqp_obj_file)

    return single_qqp_obj
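
Example #4 combines an in-process singleton (the global single_qqp_obj, assumed to be initialised to None at module level) with an on-disk pickle, so the expensive Qqp() construction happens at most once; force=True re-reads the pickle or rebuilds the object. Since Qqp and file_tool are project-internal, the following is only a sketch of the same lazy-singleton-plus-pickle pattern using the standard library and a stand-in object:

import os
import pickle

_cached_obj = None  # module-level cache, analogous to single_qqp_obj


def get_obj(force=False, cache_file='obj.pkl'):
    # hypothetical re-implementation of the pattern in get_qqp_obj
    global _cached_obj
    if force or _cached_obj is None:
        if os.path.isfile(cache_file):
            with open(cache_file, 'rb') as f:
                _cached_obj = pickle.load(f)
        else:
            _cached_obj = {'built': True}  # stand-in for the expensive Qqp() construction
            with open(cache_file, 'wb') as f:
                pickle.dump(_cached_obj, f)
    return _cached_obj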
Example #5
    def parse_sentences(self):
        parsed_sentence_org_file = 'corpus/mrpc/parsed_sentences.txt'
        parsed_sentence_dict_file = 'corpus/mrpc/parsed_sentence_dict.pkl'
        if file_tool.check_file(parsed_sentence_dict_file):
            parsed_sentence_dict = file_tool.load_data_pickle(
                parsed_sentence_dict_file)
        else:
            parsed_sentence_dict = parser_tool.extra_parsed_sentence_dict_from_org_file(
                parsed_sentence_org_file)
            file_tool.save_data_pickle(parsed_sentence_dict,
                                       parsed_sentence_dict_file)

        if len(parsed_sentence_dict) != len(self.sentence_dict):
            # raise ValueError("parsed_sentence_dict not march sentence_dict")
            pass

        if not general_tool.compare_two_dict_keys(self.sentence_dict.copy(),
                                                  parsed_sentence_dict.copy()):
            raise ValueError("parsed_sentence_dict not march sentence_dict")

        # for sent_id, info in parsed_sentence_dict.items():
        #     if info['original'] != self.sentence_dict[sent_id].original:
        #         raise ValueError("parsed_sentence_dict not match sentence_dict")

        for sent_id, parse_info in parsed_sentence_dict.items():
            sent_id = str(sent_id)
            self.sentence_dict[sent_id].parse_info = parse_info

        self.parse_info = parser_tool.process_parsing_sentence_dict(
            parsed_sentence_dict, modify_dep_name=True)
        numeral_sentence_dict = self.parse_info.numeral_sentence_dict

        if not general_tool.compare_two_dict_keys(
                self.sentence_dict.copy(), numeral_sentence_dict.copy()):
            raise ValueError("numeral_sentence_dict not march sentence_dict")

        for sent_id in self.sentence_dict.keys():
            self.sentence_dict[sent_id].syntax_info = numeral_sentence_dict[
                sent_id]

        # print('the count of dep type:{}'.format(self.parse_info.dependency_count))
        # print('the max len of sentence_tokens:{}'.format(self.parse_info.max_sent_len))

Example #6
    def parse_sentences(self):
        parsed_sentence_org_file = file_tool.connect_path(
            self.data_path, 'parsed_sentences.txt')
        parsed_sentence_dict_file = file_tool.connect_path(
            self.data_path, 'parsed_sentence_dict.pkl')
        if file_tool.check_file(parsed_sentence_dict_file):
            parsed_sentence_dict = file_tool.load_data_pickle(
                parsed_sentence_dict_file)
        else:
            parsed_sentence_dict = parser_tool.extra_parsed_sentence_dict_from_org_file(
                parsed_sentence_org_file)
            file_tool.save_data_pickle(parsed_sentence_dict,
                                       parsed_sentence_dict_file)

        if len(parsed_sentence_dict) != len(self.sentence_dict):
            raise ValueError("parsed_sentence_dict not march sentence_dict")

        if not general_tool.compare_two_dict_keys(self.sentence_dict.copy(),
                                                  parsed_sentence_dict.copy()):
            raise ValueError("parsed_sentence_dict not march sentence_dict")

        for sent_id, info in parsed_sentence_dict.items():
            if info['original'] != self.sentence_dict[sent_id].original:
                raise ValueError(
                    "parsed_sentence_dict not march sentence_dict")

        for sent_id, parse_info in parsed_sentence_dict.items():
            sent_id = str(sent_id)
            self.sentence_dict[sent_id].parse_info = parse_info

        self.parse_info = parser_tool.process_parsing_sentence_dict(
            parsed_sentence_dict, modify_dep_name=True)
        numeral_sentence_dict = self.parse_info.numeral_sentence_dict
        self.max_sent_len = self.parse_info.max_sent_len

        if not general_tool.compare_two_dict_keys(
                self.sentence_dict.copy(), numeral_sentence_dict.copy()):
            raise ValueError("numeral_sentence_dict not march sentence_dict")

        for sent_id in self.sentence_dict.keys():
            self.sentence_dict[sent_id].syntax_info = numeral_sentence_dict[
                sent_id]
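
Both parse_sentences variants lean on general_tool.compare_two_dict_keys to check that the parsed sentences cover exactly the ids in self.sentence_dict. Its implementation is not shown; a plausible minimal version (the behaviour is an assumption, and the real helper may mutate its arguments, which would explain the defensive .copy() calls at the call sites) would be:

def compare_two_dict_keys(dict_a, dict_b):
    # assumed behaviour: True when both dicts are keyed by exactly the same ids
    return set(dict_a.keys()) == set(dict_b.keys())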
Example #7
    def __extra_examples_from_org_file__(self, org_file, des_filename):
        if file_tool.check_file(des_filename):
            examples_dicts = file_tool.load_data_pickle(des_filename)
            return examples_dicts
        example_dicts = []
        rows = file_tool.load_data(org_file, mode='r')
        examples_id = 0
        for i, row in enumerate(rows):
            result = row.split("\t")
            if i == 0:
                continue
            if len(result) != 5:
                raise RuntimeError
            example_temp = {
                'sent_id1': int(result[1]),
                'sent_id2': int(result[2]),
                'label': int(result[0]),
                'id': examples_id
            }
            example_dicts.append(example_temp)
            examples_id += 1
        file_tool.save_data_pickle(example_dicts, des_filename)
        return example_dicts
Example #8
    def __extra_sentences_from_org_file__(self, org_file, des_filename):
        if file_tool.check_file(des_filename):
            sentence_dict = file_tool.load_data_pickle(des_filename)
            return sentence_dict
        sentence_dict = {}
        rows = file_tool.load_data(org_file, mode='r')
        for i, row in enumerate(rows):
            result = row.split("\t")
            if i == 0:
                continue
            if len(result) != 7:
                raise RuntimeError

            if not general_tool.is_number(result[0]):
                raise RuntimeError

            if str(result[0]) in sentence_dict:
                raise RuntimeError

            sentence_dict[str(result[0])] = str(result[1])

        file_tool.save_data_pickle(sentence_dict, des_filename)

        return sentence_dict
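
Examples #7 and #8 extract, respectively, id-based example records and an id-to-text sentence map from the same corpus. A small illustrative join, with made-up data, shows how a labelled text pair can be reconstructed from the two structures (sentence ids are stored as strings in the dict, so the integer ids from the example records are converted back with str):

# made-up outputs of the two extraction methods above
sentence_dict = {'100': 'The cat sat on the mat.', '101': 'A cat was sitting on the mat.'}
example_dicts = [{'sent_id1': 100, 'sent_id2': 101, 'label': 1, 'id': 0}]

for example in example_dicts:
    sent1 = sentence_dict[str(example['sent_id1'])]
    sent2 = sentence_dict[str(example['sent_id2'])]
    print(example['label'], sent1, '|', sent2)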