def find_initials(self):
        '''
        Tries to find name initials in the text (J., A., etc.)

        A space-separated token is treated as an initial when it is either
        an uppercase character followed by a dot, or any token that ends
        with a dot and starts with one of ``czech_letters``. Every occurrence
        of each collected initial in the source text is then stored in
        ``self.data_rows`` as a row flagged with ``initial = True``.
        '''

        content = self.text_doc.content

        for token in content.split(' '):
            if len(token) == 2 and token[0].isupper() and token[1] == '.':
                self.initials.add(token)
            elif token.endswith('.') and \
                    any(token.startswith(letter) for letter in czech_letters):
                self.initials.add(token)

        self.initials = set(self.initials)

        for initial in self.initials:
            # The initial is matched literally. re.escape() protects every
            # regex metacharacter, not only the dot, so a token such as
            # "Č(x)." can neither break the pattern nor match unintended text.
            pattern = re.compile(re.escape(initial))
            for m in pattern.finditer(content):
                # Row offsets: match start shifted by one, match end as-is.
                idr = DataRow.from_data(10, m.start() + 1, m.end(), initial)
                idr.initial = True
                self.data_rows.add(idr)

        self.data_rows.sort()
Exemple #2
0
    def prepare_data_rows(self, train):
        """Build one DataRow per example of ``train`` and store it.

        For every example, each of the ``self.n_variables`` variables is
        assigned the label whose membership degree (as returned by
        ``self.data_base.membership_function``) is the highest. If no label
        of a variable has a degree above 0.0 an error is printed and the
        process exits with status 1. The resulting rows are appended to
        ``self.data_row_array``.
        """
        for row_idx in range(train.size()):
            row = DataRow()
            target = train.get_output_as_integer_with_pos(row_idx)
            example = train.get_example(row_idx)

            # NOTE(review): this collects whole examples indexed by the
            # variable number, not the feature values of `example` --
            # possibly meant to be example[var]; confirm before changing.
            features = [
                train.get_example(var) for var in range(self.n_variables)
            ]

            best_labels = []
            for var in range(self.n_variables):
                best_degree = 0.0
                best_label = -1
                label_count = self.data_base.num_labels(var)
                print("n_labels: " + str(label_count))

                for label in range(label_count):
                    print("example[" + str(var) + ")]: " + str(example[var]))
                    degree = self.data_base.membership_function(
                        var, label, example[var])
                    print("per: " + str(degree))
                    # Strict comparison: on ties the first label wins.
                    if degree > best_degree:
                        best_degree, best_label = degree, label

                if best_degree == 0.0:
                    # No label covers this value at all.
                    print(
                        "There was an Error while searching for the antecedent of the rule"
                    )
                    print("Variable " + str(var))
                    exit(1)

                best_labels.append(best_label)

            row.set_three_parameters(target, features, best_labels)
            self.data_row_array.append(row)
Exemple #3
0
 def fetchone(self, sql=None, params=()):
     """Fetch the next row from the cursor, optionally executing ``sql`` first.

     When ``sql`` is truthy it is run through ``self.execute(sql, params)``
     before fetching. A truthy row is wrapped in a ``DataRow`` together with
     a mapping from column name to column position; a falsy result is
     returned unchanged.
     """
     if sql:
         self.execute(sql, params)

     row = self._cursor.fetchone()
     if not row:
         return row

     # The first item of each description entry is used as the column name.
     positions = {
         column[0]: index
         for index, column in enumerate(self._cursor.description)
     }
     return DataRow(row, positions)
    def find_incomplete_names(self):
        '''
        Search for unknown words after known words.

        For every unprocessed data row that has no follower (according to
        ``rows.has_follower``), the source text right after the row is
        scanned character by character. As long as the text keeps looking
        like a continuation of the name, it is appended to the row value.
        The extended name is stored in ``self.name_list``; afterwards the
        list is sorted and ``self.solve_conflicts()`` is called.
        '''

        chars = ['\'', '.']  # non-letter characters accepted inside a name
        rows = self.data_rows
        content = self.text_doc.content

        for i in range(rows.size()):
            if rows[i].processed or rows.has_follower(i):
                continue

            start = rows[i].start_offset
            word = rows[i].value + ' '
            word_start = False
            position = rows[i].end_offset + 1
            if position >= self.text_doc.size():
                break  # Nothing else to be found
            if not content[position].isupper():
                continue

            # Past this position the scan stops at the next space.
            tolerance = position + self.tolerance
            char = content[position]
            while (char.isalpha() or char.isspace() or char in chars) and \
                    (position < tolerance or char != ' ') and \
                    (char != '\n'):
                if word_start and not char.isupper():
                    break  # Not name, end loop
                # The character after a space must start a new word.
                word_start = char == ' '
                if char == '.':
                    # A dot is accepted only right after a single uppercase
                    # letter that itself follows a space or another dot.
                    if not ((word[-2] == ' ' or word[-2] == '.') and
                            word[-1].isupper()):
                        break  # Not initial
                word += char
                position += 1
                if position >= len(content):
                    # The name runs to the very end of the text; indexing
                    # further would raise IndexError.
                    break
                char = content[position]

            words = [x for x in word.strip().split(' ') if x]
            # Keep words up to (not including) the first repeated one.
            name = []
            for w in words:
                if w in name:
                    break
                name.append(w)
            name = ' '.join(name)
            end = start + len(name) - 1
            self.name_list.add(DataRow.from_data(1, start, end, name))

        self.name_list.sort()
        self.solve_conflicts()
    def find_adjacent_names(self):
        '''
        Loops through data rows and attempts to find names.

        Starting from each unprocessed single-part row whose value begins
        with an uppercase character or one of ``czech_letters``, the rows
        that directly follow it in the text are chained into one name and
        stored in ``self.name_list``. Entries consisting of a single word
        are cleared afterwards, and the processed flags of the data rows
        are reset.
        '''

        def starts_czech(value):
            # True when the value begins with any of the czech_letters.
            return any(value.startswith(letter) for letter in czech_letters)

        rows = self.data_rows
        for idx in range(rows.size()):
            current = rows[idx]
            if len(current.parts) != 1:
                continue
            if not (current.value[0].isupper() or starts_czech(current.value)):
                continue

            parts = DataRowList.from_string(current)
            if current.processed:
                continue

            for nxt in range(idx + 1, rows.size()):
                candidate = rows[nxt]
                # A follower starts exactly two offsets after the previous
                # part ends, and must not repeat a value already in the
                # chain unless it is an initial.
                adjacent = parts[-1].end_offset == candidate.start_offset - 2
                if not (adjacent and
                        (not parts.contains(candidate.value) or
                         candidate.initial)):
                    break

                czech = starts_czech(candidate.value)
                if candidate.value[0].isupper() or czech:
                    parts.add(candidate)
                    candidate.processed = True

            current.processed = True
            first_offset = parts[0].start_offset
            last_offset = parts[-1].end_offset
            full_name = ' '.join(p.value for p in parts).strip()
            self.name_list.add(
                DataRow.from_data(0, first_offset, last_offset, full_name))

        # Single-word entries are not considered names.
        for k in range(self.name_list.size()):
            if len(self.name_list[k].value.split(' ')) <= 1:
                self.name_list[k] = self.name_list[k].clear()

        self.name_list.clean()
        self.data_rows.clear_processed_flag()
    def __init__(self, data_base_pass):
        """Set up an empty instance bound to ``data_base_pass``.

        ``antecedent`` gets one slot per variable reported by
        ``data_base_pass.num_variables()``, each initialised to -1
        ("don't care"). All numeric measures start at zero and the class
        value starts at -1.
        """
        variable_count = data_base_pass.num_variables()
        # -1 means "don't care" for that variable.
        self.antecedent = [-1] * variable_count
        self.class_value = -1
        self.data_base = data_base_pass
        self.confident_value = 0.0
        self.support_value = 0.0
        self.nants = 0
        self.wracc = 0.0

        self.data_row_here = DataRow()
    def read_data(self):
        '''
        Reads the Figa output lines and stores them in ``self.data_rows``.

        Each line returned by ``self.figa_doc.get_lines()`` is wrapped in a
        ``DataRow`` and added to ``self.data_rows``, which is then sorted.
        When there are no lines, ``self.data_rows`` is left untouched.

        Raises InputError when the Figa output is invalid, or when the
        source text is invalid or empty.
        '''

        if self.figa_doc.invalid():
            raise InputError('Figa output is either empty or missing!')

        if self.text_doc.invalid() or self.text_doc.empty():
            raise InputError('Source text is either empty or missing!')

        lines = self.figa_doc.get_lines()
        if not lines:
            # Nothing to read (None or an empty list).
            return

        for line in lines:
            self.data_rows.add(DataRow(line))
        self.data_rows.sort()
    def find_full_names(self):
        '''
        Collects runs of two or more consecutive name-like words.

        The text is flattened (newlines become spaces) and split on single
        spaces. A word is name-like when it starts with an uppercase
        character and is longer than one character, or starts with one of
        ``czech_letters`` and is longer than two characters. A word ending
        with '.' or ',' closes the current run (its last character is
        dropped); any other word that is not name-like closes it as well.
        Every closed run of at least two words is added to
        ``self.name_list``.
        '''

        def name_like(token):
            czech_start = any(
                token.startswith(letter) for letter in czech_letters)
            return (token[0].isupper() and len(token) > 1) or \
                (czech_start and len(token) > 2)

        def emit(words, offset):
            # Offsets handed to the row: first character shifted by one,
            # end taken from the length of the space-joined name.
            joined = ' '.join(words)
            self.name_list.add(
                DataRow.from_data(0, offset + 1, offset + len(joined),
                                  joined))

        flat_text = self.text_doc.content.replace('\n', ' ')

        candidate = []  # words of the run currently being collected
        cursor = 0      # character offset of the current token
        begin = 0       # character offset where the current run started

        for token in flat_text.split(' '):
            if not token:
                # Empty token produced by consecutive spaces.
                cursor += 1
                continue

            qualifies = name_like(token)

            if token.endswith(('.', ',')):
                if qualifies:
                    candidate.append(token[:-1])
                if len(candidate) > 1:
                    emit(candidate, begin)
                candidate, begin = [], 0
            elif qualifies:
                if not candidate:
                    begin = cursor
                candidate.append(token)
            else:
                if len(candidate) > 1:
                    emit(candidate, begin)
                candidate, begin = [], 0

            cursor += len(token) + 1

        # A run may still be open when the text ends.
        if len(candidate) > 1:
            emit(candidate, begin)
        '''
    # replace all new line characters to preserve text structure in html
    output = output.replace('\n', '<br />')

    return output


if __name__ == '__main__':
    # Main. No arguments, use stdin and stdout. Expects existence of the
    # figa.out file in the current working directory.

    fr = os.path.abspath('./figa.out')
    if not os.path.exists(fr):
        print('Could not find file "{}"!'.format(fr))
        sys.exit(1)

    # The context manager closes the file even if building a DataRow raises.
    with open(fr) as f:
        datarows = [DataRow(line.strip()) for line in f]

    text = sys.stdin.read()

    output = highlight_names(text, datarows)

    print(output)

# END highlight_names.py