Example #1
def extract_blocks(input_file, k_base):
    '''Extract block structure for each value in input file'''
    r_input = [r for r in F.read_input(input_file)]
    normalized_input = [
        F.remove_stop_words(F.normalize_str(v)) for v in r_input
    ]
    blocks = []
    for raw_terms, record in zip(r_input, normalized_input):
        blocks.append(build_blocks(record.split(), raw_terms.split(), k_base))
    return blocks
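All of the examples on this page lean on a project-specific helper module F for text normalization. As a point of reference only, here is a minimal sketch of what the two calls used above might look like, assuming normalize_str lower-cases and strips punctuation and remove_stop_words filters a small stop-word list; both the behavior and the stop-word list are assumptions, not the project's actual implementation.

import re

# Placeholder stop-word list -- an assumption for illustration only.
STOP_WORDS = {'the', 'of', 'and', 'a', 'an', 'in'}

def normalize_str(value):
    '''Lower-case the text and replace punctuation with spaces.'''
    return re.sub(r'[^\w\s]', ' ', value.lower()).strip()

def remove_stop_words(value):
    '''Drop stop words from an already-normalized string.'''
    return ' '.join(t for t in value.split() if t not in STOP_WORDS)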
Example #2
    def create_dataframe(self, kb_file):
        num_attributes = len(self.get_attributes())
        try:
            tree = ET.parse(kb_file)
        except ET.ParseError as error:
            print("Error reading KB file for Pandas Dataframe. Cause: " +
                  error.msg)
            sys.exit(1)
        record = tree.getroot()
        data = {'segment': [], 'attribute': []}
        for segment in record:
            data['segment'].append(F.normalize_str(segment.text))
            data['attribute'].append(segment.tag)
        self.df = pd.DataFrame(data, columns=['segment', 'attribute', 'label'])
        le = preprocessing.LabelEncoder()
        self.df['label'] = le.fit_transform(self.df['attribute'])
        self.labels_dict = le.inverse_transform(range(0, num_attributes))
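For context, scikit-learn's LabelEncoder maps each attribute name to an integer label and can map the integers back, which is what labels_dict relies on above. A small standalone illustration (the attribute names are made up):

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
labels = le.fit_transform(['author', 'title', 'author', 'year'])
# Classes are sorted alphabetically: author -> 0, title -> 1, year -> 2,
# so labels == [0, 1, 0, 2].
print(le.inverse_transform([0, 1, 2]))  # ['author' 'title' 'year']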
Example #3
    def init_kb(self, kb_file):
        try:
            tree = ET.parse(kb_file)
        except ET.ParseError as error:
            print("Error reading KB file. Cause: " + error.msg)
            sys.exit(1)
        data = tree.getroot()
        for segment in data:
            attr = segment.tag
            text = F.normalize_str(segment.text)
            self.registers.add(text)
            if attr not in self.k_base:
                self.k_base[attr] = {}
            terms = text.split()
            for term in terms:
                if term not in self.k_base[attr]:
                    self.k_base[attr][term] = 0
                self.k_base[attr][term] += 1
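The nested dictionary built above is simply a per-attribute term-frequency table. An equivalent sketch using the standard library, with made-up attribute/text pairs standing in for the parsed KB segments:

from collections import defaultdict, Counter

k_base = defaultdict(Counter)
for attr, text in [('author', 'john smith'), ('title', 'web data extraction')]:
    k_base[attr].update(text.split())
# k_base['author']['john'] == 1, k_base['title']['data'] == 1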
Example #4
    def init_kb(self, kb_file):
        '''Parse Knowledge Base and prepare it to extract the content-based features'''
        logger.info('Parsing knowledge base file...')
        data = F.read_k_base(kb_file)

        for item in data:
            attribute = item.tag
            value = F.remove_stop_words(F.normalize_str(item.text))

            # Check if a value contains only stop words
            if not value:
                continue

            terms = value.split()
            i = 0
            while i < len(terms) - 1:
                if terms[i] in self.co_occurrences:
                    if (terms[i + 1],
                            attribute) not in self.co_occurrences[terms[i]]:
                        self.co_occurrences[terms[i]].append(
                            (terms[i + 1], attribute))
                else:
                    # Record the pair the first time the term is seen as well,
                    # instead of starting with an empty list and dropping it.
                    self.co_occurrences[terms[i]] = [(terms[i + 1], attribute)]
                i += 1
            if terms[-1] not in self.co_occurrences:
                self.co_occurrences[terms[-1]] = []

            for term in terms:
                occurrence = Occurrence(term)
                if attribute in self.k_base:
                    if term not in [
                            obj.term for obj in self.k_base[attribute]
                    ]:
                        self.k_base[attribute].append(occurrence)
                    else:
                        occ = [
                            v for v in self.k_base[attribute] if v.term == term
                        ]
                        occ[0].frequency += 1
                else:
                    self.k_base[attribute] = [occurrence]
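This version assumes an Occurrence helper holding a term and its frequency within an attribute. A minimal sketch of what that class might look like, assuming the count starts at 1 on creation; the field names match how the loop above reads them, the rest is an assumption:

class Occurrence:
    '''Sketch of the assumed helper: a term and its frequency within an attribute.'''

    def __init__(self, term):
        self.term = term
        self.frequency = 1  # assumed to start at 1 for the first occurrence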
Example #5
    def record_evaluation(reference_file, results_file, attributes):
        '''Compute evaluation metrics per Record'''
        step = 'Matching Step' if 'matching_results.xml' in results_file else 'Reinforcement Step'
        reference = F.read_input(reference_file)
        results = F.read_input(results_file)
        record_evaluation = []
        for res, ref in zip(results, reference):
            results_stats = {}
            reference_stats = {}
            right_answers = {}
            attr_evaluation = {}

            result_record = ET.fromstring('<record>'+res+'</record>')
            reference_record = ET.fromstring('<record>'+ref+'</record>')

            for reference_block in reference_record:
                if reference_block.tag not in reference_stats:
                    reference_stats[reference_block.tag] = len(
                        reference_block.text.split())
                else:
                    reference_stats[reference_block.tag] += len(
                        reference_block.text.split())

            for result_block in result_record:
                # Skip unlabeled blocks; 'none' is not a real attribute.
                if result_block.tag != 'none':
                    if result_block.tag not in results_stats:
                        results_stats[result_block.tag] = len(
                            result_block.text.split())
                    else:
                        results_stats[result_block.tag] += len(
                            result_block.text.split())

            for result_block in result_record:
                for reference_block in reference_record:
                    if F.normalize_str(result_block.text) in F.normalize_str(reference_block.text) and result_block.tag == reference_block.tag:
                        if result_block.tag not in right_answers:
                            right_answers[result_block.tag] = len(
                                result_block.text.split())
                        else:
                            right_answers[result_block.tag] += len(
                                result_block.text.split())
                        break

            for attr in attributes:
                if attr in results_stats and attr in reference_stats and attr in right_answers:
                    attr_evaluation[attr] = Metrics()
                    attr_evaluation[attr].precision = right_answers[attr] / \
                        results_stats[attr]
                    attr_evaluation[attr].recall = right_answers[attr] / \
                        reference_stats[attr]
                    attr_evaluation[attr].f_measure = (2*attr_evaluation[attr].precision*attr_evaluation[attr].recall)/(
                        attr_evaluation[attr].precision+attr_evaluation[attr].recall)
                elif attr in results_stats and attr not in reference_stats:
                    attr_evaluation[attr] = Metrics()

            record = Metrics()
            for attr in attr_evaluation:
                record.precision += attr_evaluation[attr].precision
                record.recall += attr_evaluation[attr].recall
                record.f_measure += attr_evaluation[attr].f_measure
            record.precision /= len(attr_evaluation)
            record.recall /= len(attr_evaluation)
            record.f_measure /= len(attr_evaluation)
            record_evaluation.append(record)

        precision = 0
        recall = 0
        f_measure = 0
        for record in record_evaluation:
            precision += record.precision
            recall += record.recall
            f_measure += record.f_measure
        precision /= len(results)
        recall /= len(results)
        f_measure /= len(results)

        print(
            '----------------------------------------------------------------------------')
        print('{0} - Results Evaluation Per Record'.format(step))
        print(
            '----------------------------------------------------------------------------')
        print('{:<20} {:<20} {:<18}'.format(
            'Precision', 'Recall', 'F-Measure'))
        print('{:<20} {:<20} {:<18}'.format(precision, recall, f_measure))
        print()
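The evaluation routines in Examples #5 through #9 assume a Metrics container with precision, recall and F-measure fields initialized to zero; Examples #8 and #9 additionally call a calculate_f_measure() method on it. A minimal sketch under those assumptions:

class Metrics:
    '''Sketch of the assumed container for precision, recall and F-measure.'''

    def __init__(self):
        self.precision = 0.0
        self.recall = 0.0
        self.f_measure = 0.0

    def calculate_f_measure(self):
        # Harmonic mean of precision and recall, guarded against division by zero.
        if self.precision + self.recall > 0:
            self.f_measure = (2 * self.precision * self.recall /
                              (self.precision + self.recall))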
Example #6
    def attribute_evaluation(reference_file, results_file, attributes):
        '''Compute evaluation metrics per Attribute'''
        step = 'Matching Step' if 'matching_results.xml' in results_file else 'Reinforcement Step'
        reference = F.read_input(reference_file)
        results = F.read_input(results_file)
        results_stats = {}
        reference_stats = {}
        right_answers = {}
        attr_evaluation = {}
        for attr in attributes:
            attr_evaluation[attr] = Metrics()

        for res, ref in zip(results, reference):
            result_record = ET.fromstring('<record>'+res+'</record>')
            reference_record = ET.fromstring('<record>'+ref+'</record>')

            for reference_block in reference_record:
                if reference_block.tag not in reference_stats:
                    reference_stats[reference_block.tag] = len(
                        reference_block.text.split())
                else:
                    reference_stats[reference_block.tag] += len(
                        reference_block.text.split())

            for result_block in result_record:
                # Skip unlabeled blocks; 'none' is not a real attribute.
                if result_block.tag != 'none':
                    if result_block.tag not in results_stats:
                        results_stats[result_block.tag] = len(
                            result_block.text.split())
                    else:
                        results_stats[result_block.tag] += len(
                            result_block.text.split())

            for result_block in result_record:
                for reference_block in reference_record:
                    if F.normalize_str(result_block.text) in F.normalize_str(reference_block.text) and result_block.tag == reference_block.tag:
                        if result_block.tag not in right_answers:
                            right_answers[result_block.tag] = len(
                                result_block.text.split())
                        else:
                            right_answers[result_block.tag] += len(
                                result_block.text.split())
                        break

        for attr in attributes:
            if attr in results_stats and attr in reference_stats and attr in right_answers:
                attr_evaluation[attr].precision = right_answers[attr] / \
                    results_stats[attr]
                attr_evaluation[attr].recall = right_answers[attr] / \
                    reference_stats[attr]
                attr_evaluation[attr].f_measure = (2*attr_evaluation[attr].precision*attr_evaluation[attr].recall)/(
                    attr_evaluation[attr].precision+attr_evaluation[attr].recall)

        print(
            '----------------------------------------------------------------------------')
        print('{0} - Results Evaluation Per Attribute'.format(step))
        print(
            '----------------------------------------------------------------------------')
        print('{:<15} {:<20} {:<20} {:<18}'.format(
            'Attribute', 'Precision', 'Recall', 'F-Measure'))
        for k, v in attr_evaluation.items():
            if v.f_measure > 0:
                print('{:<15} {:<20} {:<20} {:<18}'.format(
                    k, v.precision, v.recall, v.f_measure))
Example #7
def build_blocks(record, k_base):
    segments = record.split(",")
    blocks_list = []
    for b in segments:
        blocks_list.append(Block(F.normalize_str(b), b))
    return blocks_list
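build_blocks wraps each comma-separated segment in a Block. Based on how Examples #8 and #9 later read block.attr and block.value, a plausible sketch of that class is below; the constructor argument names beyond value, and the 'none' default for attr, are assumptions:

class Block:
    '''Sketch of the assumed block structure: normalized text, raw text and
    the attribute assigned later during matching.'''

    def __init__(self, value, raw_value, attr='none'):
        self.value = value          # normalized segment text
        self.raw_value = raw_value  # original segment text
        self.attr = attr            # attribute label, filled in by the matcher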
Example #8
def evaluate_results_per_record(results, reference_file, attributes):
    reference = F.read_file(reference_file)
    record_evaluation = []

    for result_record, ref in zip(results, reference):
        results_stats = {}
        reference_stats = {}
        right_answers = {}
        attr_evaluation = {}

        reference_record = ET.fromstring('<record>' + ref + '</record>')

        for reference_block in reference_record:
            if reference_block.tag not in reference_stats:
                reference_stats[reference_block.tag] = len(
                    reference_block.text.split())
            else:
                reference_stats[reference_block.tag] += len(
                    reference_block.text.split())

        for result_block in result_record:
            # Skip unlabeled blocks; 'none' is not a real attribute.
            if result_block.attr != 'none':
                if result_block.attr not in results_stats:
                    results_stats[result_block.attr] = len(
                        result_block.value.split())
                else:
                    results_stats[result_block.attr] += len(
                        result_block.value.split())

        for result_block in result_record:
            for reference_block in reference_record:
                if result_block.value in F.normalize_str(
                        reference_block.text
                ) and result_block.attr == reference_block.tag:
                    if result_block.attr not in right_answers:
                        right_answers[result_block.attr] = len(
                            result_block.value.split())
                    else:
                        right_answers[result_block.attr] += len(
                            result_block.value.split())
                    break

        for attr in attributes:
            if attr in results_stats and attr in reference_stats and attr in right_answers:
                attr_evaluation[attr] = Metrics()
                attr_evaluation[
                    attr].precision = right_answers[attr] / results_stats[attr]
                attr_evaluation[
                    attr].recall = right_answers[attr] / reference_stats[attr]
                attr_evaluation[attr].calculate_f_measure()
            elif attr in results_stats and attr not in reference_stats:
                attr_evaluation[attr] = Metrics()

        record = Metrics()
        for attr in attr_evaluation:
            record.precision += attr_evaluation[attr].precision
            record.recall += attr_evaluation[attr].recall
            record.f_measure += attr_evaluation[attr].f_measure
        record.precision /= len(attr_evaluation)
        record.recall /= len(attr_evaluation)
        record.f_measure /= len(attr_evaluation)
        record_evaluation.append(record)

    final_metrics = Metrics()
    for record in record_evaluation:
        final_metrics.precision += record.precision
        final_metrics.recall += record.recall
        final_metrics.f_measure += record.f_measure
    final_metrics.precision /= len(record_evaluation)
    final_metrics.recall /= len(record_evaluation)
    final_metrics.f_measure /= len(record_evaluation)

    print('---------- Results Evaluation Per Record ----------')
    print('{:<20} {:<20} {:<18}'.format('Precision', 'Recall', 'F-Measure'))
    print('{:<20} {:<20} {:<18}'.format(final_metrics.precision,
                                        final_metrics.recall,
                                        final_metrics.f_measure))
    print()
Example #9
def evaluate_results_per_attribute(results, reference_file, attributes):
    reference = F.read_file(reference_file)
    results_stats = {}
    reference_stats = {}
    right_answers = {}
    attr_evaluation = {}
    record_evaluation = []
    for attr in attributes:
        attr_evaluation[attr] = Metrics()

    for result_record, ref in zip(results, reference):
        reference_record = ET.fromstring('<record>' + ref + '</record>')

        for reference_block in reference_record:
            if reference_block.tag not in reference_stats:
                reference_stats[reference_block.tag] = len(
                    reference_block.text.split())
            else:
                reference_stats[reference_block.tag] += len(
                    reference_block.text.split())

        for result_block in result_record:
            if result_block.attr != 'none':
                if result_block.attr not in results_stats:
                    results_stats[result_block.attr] = len(
                        result_block.value.split())
                else:
                    results_stats[result_block.attr] += len(
                        result_block.value.split())

        for result_block in result_record:
            for reference_block in reference_record:
                if result_block.value in F.normalize_str(
                        reference_block.text
                ) and result_block.attr == reference_block.tag:
                    if result_block.attr not in right_answers:
                        right_answers[result_block.attr] = len(
                            result_block.value.split())
                    else:
                        right_answers[result_block.attr] += len(
                            result_block.value.split())
                    break

    for attr in attributes:
        if attr in results_stats and attr in reference_stats and attr in right_answers:
            attr_evaluation[
                attr].precision = right_answers[attr] / results_stats[attr]
            attr_evaluation[
                attr].recall = right_answers[attr] / reference_stats[attr]
            attr_evaluation[attr].calculate_f_measure()

    print()
    print('---------- Results Evaluation Per Attribute ----------')
    print('{:<15} {:<20} {:<20} {:<18}'.format('Attribute', 'Precision',
                                               'Recall', 'F-Measure'))

    total_metrics = Metrics()
    non_zero_attrs = 0
    for k, v in attr_evaluation.items():
        if v.f_measure > 0:
            print('{:<15} {:<20} {:<20} {:<18}'.format(k, v.precision,
                                                       v.recall, v.f_measure))
            total_metrics.precision += v.precision
            total_metrics.recall += v.recall
            total_metrics.f_measure += v.f_measure
            non_zero_attrs += 1

    total_metrics.precision /= non_zero_attrs
    total_metrics.recall /= non_zero_attrs
    total_metrics.f_measure /= non_zero_attrs
    print()
    print('{:<15} {:<20} {:<20} {:<18}'.format("Total",
                                               total_metrics.precision,
                                               total_metrics.recall,
                                               total_metrics.f_measure))
    print()
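Putting the pieces together, a hypothetical driver might build the blocks, run the (not shown) matching step that assigns each block.attr, and then call both evaluation routines. The file names, the attributes list and the presence of a separate matching step are assumptions for illustration only:

attributes = ['author', 'title', 'year']          # assumed attribute set
records = F.read_file('input.txt')                # assumed input file
results = [build_blocks(record, k_base) for record in records]
# ... matching step assigns an attribute to every block (not shown) ...
evaluate_results_per_record(results, 'reference.xml', attributes)
evaluate_results_per_attribute(results, 'reference.xml', attributes)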