def find_initials(self):
    ''' Tries to find name initials in the text (J., A., etc.).

    The source text is split on single spaces. A token is collected into
    ``self.initials`` when it is an upper-case character followed by a dot,
    or when it ends with a dot and starts with one of ``czech_letters``.
    Every occurrence of a collected token in the text is then added to
    ``self.data_rows`` as a row flagged with ``initial = True`` and the rows
    are re-sorted.
    '''
    content = self.text_doc.content
    for token in content.split(' '):
        if len(token) == 2 and token[0].isupper() and token[1] == '.':
            self.initials.add(token)
        elif token.endswith('.') and any(
                token.startswith(letter) for letter in czech_letters):
            self.initials.add(token)

    self.initials = set(self.initials)
    for initial in self.initials:
        # re.escape neutralises every regex metacharacter in the token, not
        # only the dot, so tokens such as "Š(a)." are matched literally.
        pattern = re.compile(re.escape(initial))
        for match in pattern.finditer(content):
            # The start offset is shifted by one; the end offset is the
            # (exclusive) end of the match as reported by ``re``.
            row = DataRow.from_data(10, match.start() + 1, match.end(),
                                    initial)
            row.initial = True
            self.data_rows.add(row)
    self.data_rows.sort()
def prepare_data_rows(self, train):
    """Build one ``DataRow`` per training example and store it.

    For every index ``i`` below ``train.size()`` the class value and the
    example are read from ``train``; for each of the ``self.n_variables``
    variables the label with the highest ``membership_function`` result is
    selected. The class value, the feature array and the selected labels are
    handed to ``DataRow.set_three_parameters`` and the row is appended to
    ``self.data_row_array``.

    Args:
        train: data set object providing ``size()``, ``get_example(i)`` and
            ``get_output_as_integer_with_pos(i)``.

    Raises:
        SystemExit: with status 1 when no label of some variable yields a
            membership value greater than 0.0.
    """
    import sys  # local import: the file-level imports are not visible here

    for i in range(train.size()):
        data_row_temp = DataRow()
        class_value = train.get_output_as_integer_with_pos(i)
        example = train.get_example(i)

        # NOTE(review): this indexes ``train`` by variable number, not the
        # current example by variable number (``example[f_variable]``) --
        # looks unintended; confirm before changing.
        example_feature_array = [
            train.get_example(f_variable)
            for f_variable in range(self.n_variables)
        ]

        label_array = []
        for m in range(self.n_variables):
            max_value = 0.0
            etq = -1  # index of the best label found so far
            n_labels = self.data_base.num_labels(m)
            print("n_labels: " + str(n_labels))
            for n in range(n_labels):
                print("example[" + str(m) + ")]: " + str(example[m]))
                per = self.data_base.membership_function(m, n, example[m])
                print("per: " + str(per))
                if per > max_value:
                    max_value = per
                    etq = n
            if max_value == 0.0:
                print(
                    "There was an Error while searching for the antecedent of the rule"
                )
                print("Variable " + str(m))
                # sys.exit instead of the interactive ``exit`` helper, which
                # is only installed by the ``site`` module.
                sys.exit(1)
            label_array.append(etq)

        data_row_temp.set_three_parameters(class_value,
                                           example_feature_array,
                                           label_array)
        self.data_row_array.append(data_row_temp)
def fetchone(self, sql=None, params=()):
    """Fetch the next row from the cursor, optionally running a query first.

    Args:
        sql: statement passed to ``self.execute`` before fetching; skipped
            when falsy.
        params: parameters passed to ``self.execute`` together with ``sql``.

    Returns:
        A ``DataRow`` built from the fetched row and a mapping of column
        name to column position, or the cursor's own falsy result when
        there is no row.
    """
    if sql:
        self.execute(sql, params)

    row = self._cursor.fetchone()
    if not row:
        return row

    # The first item of every description entry is used as the column name.
    name_to_position = {
        column[0]: position
        for position, column in enumerate(self._cursor.description)
    }
    return DataRow(row, name_to_position)
def find_incomplete_names(self):
    ''' Search for unknown words after known words.

    For every unprocessed data row without a follower, the text right after
    the row is scanned as long as it keeps looking like a continuation of
    the name (capitalised words, initials, apostrophes). The row value plus
    the scanned words, cut at the first repeated word, are added to
    ``self.name_list``. Finally the list is sorted and
    ``self.solve_conflicts()`` is called.

    The scan stops at the end of the text instead of indexing past it.
    '''
    name_chars = ('\'', '.')
    content = self.text_doc.content
    rows = self.data_rows

    for i in range(rows.size()):
        if rows[i].processed or rows.has_follower(i):
            continue

        start = rows[i].start_offset
        word = rows[i].value + ' '
        word_start = False
        position = rows[i].end_offset + 1
        if position >= self.text_doc.size():
            break  # Nothing else to be found
        if not content[position].isupper():
            continue

        tolerance = position + self.tolerance
        # Bounded by the text length: a name reaching the very end of the
        # text must not raise IndexError.
        while position < len(content):
            char = content[position]
            if not (char.isalpha() or char.isspace() or char in name_chars):
                break
            if position >= tolerance and char == ' ':
                break  # Past the tolerance window, stop at the next space
            if char == '\n':
                break
            if word_start and not char.isupper():
                break  # Not name, end loop
            word_start = char == ' '
            if char == '.' and not (word[-2] in (' ', '.')
                                    and word[-1].isupper()):
                break  # Not initial
            word += char
            position += 1

        # Keep the words up to (not including) the first repeated one.
        name = []
        for w in word.strip().split(' '):
            if not w:
                continue
            if w in name:
                break
            name.append(w)
        name = ' '.join(name)
        end = start + len(name) - 1
        self.name_list.add(DataRow.from_data(1, start, end, name))

    self.name_list.sort()
    self.solve_conflicts()
def find_adjacent_names(self): ''' Loops through data rows and attempts to find names. ''' #if self.data_rows.empty(): # print('No data have beed read!') # sys.exit(1) rows = self.data_rows for i in range(rows.size()): if len(rows[i].parts) != 1: continue if not rows[i].value[0].isupper(): czech = False for letter in czech_letters: if rows[i].value.startswith(letter): czech = True break if not czech: continue parts = DataRowList.from_string(rows[i]) if rows[i].processed: continue else: for j in range(i + 1, rows.size()): if parts[-1].end_offset == rows[j].start_offset - 2 and \ (not parts.contains(rows[j].value) or rows[j].initial): czech = False for letter in czech_letters: if rows[j].value.startswith(letter): czech = True break if rows[j].value[0].isupper() or czech: parts.add(rows[j]) rows[j].processed = True else: break rows[i].processed = True soff = parts[0].start_offset eoff = parts[-1].end_offset name = '' for p in parts: name += p.value + ' ' self.name_list.add( DataRow.from_data(0, soff, eoff, name.strip())) for i in range(self.name_list.size()): if len(self.name_list[i].value.split(' ')) <= 1: self.name_list[i] = self.name_list[i].clear() self.name_list.clean() self.data_rows.clear_processed_flag()
def __init__(self, data_base_pass):
    """Create an empty rule bound to ``data_base_pass``.

    Args:
        data_base_pass: data base object; its ``num_variables()`` result
            determines the length of the antecedent.
    """
    # One slot per variable, every slot starting as -1 ("don't care").
    self.antecedent = [-1] * data_base_pass.num_variables()
    self.class_value = -1
    self.data_base = data_base_pass

    # Numeric attributes of the rule, all starting at zero.
    self.confident_value = 0.0
    self.support_value = 0.0
    self.nants = 0
    self.wracc = 0.0

    self.data_row_here = DataRow()
def read_data(self):
    ''' Reads file contents and stores them in the data_rows list.

    Every line returned by ``self.figa_doc.get_lines()`` is wrapped in a
    ``DataRow`` and added to ``self.data_rows``, which is sorted afterwards.
    When there are no lines, ``self.data_rows`` is left untouched.

    Raises:
        InputError: when the figa document is invalid, or when the text
            document is invalid or empty.
    '''
    if self.figa_doc.invalid():
        raise InputError('Figa output is either empty or missing!')
    if self.text_doc.invalid() or self.text_doc.empty():
        raise InputError('Source text is either empty or missing!')

    lines = self.figa_doc.get_lines()
    if not lines:  # covers both None and an empty list
        return

    for line in lines:
        self.data_rows.add(DataRow(line))
    self.data_rows.sort()
def find_full_names(self):
    ''' Collects runs of two or more consecutive name-like words.

    The text (newlines replaced by spaces) is walked word by word. A word
    counts as name-like when it starts with an upper-case character and is
    longer than one character, or starts with one of ``czech_letters`` and
    is longer than two characters. Each run of at least two such words is
    added to ``self.name_list`` together with its offsets in the text.
    '''
    # NOTE(review): ``sentences`` and ``names`` are assigned but never used
    # in this function.
    sentences = self.text_doc.content.split('.')
    names = []
    possible_name = []  # words of the candidate name collected so far
    current_len = 0     # offset of the current word within ``text``
    index = 0           # offset at which the current candidate started
    text = self.text_doc.content.replace('\n', ' ')
    for w in text.split(' '):
        if not w:
            # Empty item produced by consecutive spaces: it still accounts
            # for one separator character.
            current_len += len(w) + 1
            continue
        if w.endswith('.') or w.endswith(','):
            # A word with trailing punctuation always ends the candidate.
            czech = False
            for letter in czech_letters:
                if w.startswith(letter):
                    czech = True
                    break
            if (w[0].isupper() and len(w) > 1) or (czech and len(w) > 2):
                # Store the word without its trailing punctuation mark.
                possible_name.append(w[:-1])
            if possible_name and len(possible_name) > 1:
                name = ' '.join(possible_name)
                # The start offset is stored shifted by one.
                idr = DataRow.from_data(0, index + 1, index + len(name), name)
                self.name_list.add(idr)
            possible_name = []
            index = 0
        else:
            czech = False
            for letter in czech_letters:
                if w.startswith(letter):
                    czech = True
                    break
            if (w[0].isupper() and len(w) > 1) or (czech and len(w) > 2):
                if not possible_name:
                    # First word of a new candidate: remember its offset.
                    index = current_len
                possible_name.append(w)
            else:
                # A word that is not name-like ends the candidate.
                if len(possible_name) > 1:
                    name = ' '.join(possible_name)
                    idr = DataRow.from_data(0, index + 1, index + len(name), name)
                    self.name_list.add(idr)
                possible_name = []
                index = 0
        # Advance past the word and the single space that followed it.
        current_len += len(w) + 1
    # Flush a candidate that reaches the end of the text.
    if len(possible_name) > 1:
        name = ' '.join(possible_name)
        idr = DataRow.from_data(0, index + 1, index + len(name), name)
        self.name_list.add(idr)

# NOTE(review): the string literal opened below is not closed within the
# visible source.
'''
# replace all new line characters to preserve text structure in html output = output.replace('\n', '<br />') return output if __name__ == '__main__': ''' Main. No arguments, use stdin and stdout. Expects existence of figa.out file. ''' fr = os.path.abspath('./figa.out') if not os.path.exists(fr): print('Could not find file "{}"!'.format(fr)) sys.exit(1) f = open(fr) datarows = [] for line in f.readlines(): datarows.append(DataRow(line.strip())) text = sys.stdin.read() f.close() output = highlight_names(text, datarows) print(output) # END highlight_names.py