def __init__(self, rules=None, pos_trigger_window=6, precedence=PRECEDENCE_DICT, trigger_tag_d=TRIGGER_TAG_DICT):
    self.pos_trigger_window = pos_trigger_window
    self.word_tokenizer = tokenizer.Tokenizer()
    self.precedence = precedence
    self.trigger_tag_dict = trigger_tag_d
    self.whitespace_pat = re.compile(r'\s+')
    # Patterns that recognize NegEx-style tag labels: the first matches any tag that can
    # terminate a trigger's scope, the second matches the trigger tags themselves.
    self.termination_tag_pat = re.compile(r'PSEU|PREN|PREP|POST|POSP|CONJ', re.IGNORECASE)
    self.trigger_tag_pat = re.compile(r'PREN|PREP|POST|POSP', re.IGNORECASE)
    self.filler = '_'
    self.rules = list()
    self.triggers = list()
    if not rules:
        # Fall back to the trigger-term list bundled with the package.
        triggers_path = os.path.join(os.path.dirname(__file__), 'termos_disparadores.txt')
        rules = open(triggers_path)
    csv_rule = csv.reader(rules, delimiter='|', quoting=csv.QUOTE_NONE, escapechar='\\')
    next(csv_rule)  # skip header
    for rule in csv_rule:
        # Create rules patterns: compile a whitespace- and accent-tolerant regex for
        # each trigger term and append it to the rule row.
        trigger = rule[0].strip()
        trigger_tokens = trigger.split()
        trig = r'\s+'.join(diacritical_charset_pattern(tok) for tok in trigger_tokens)
        rule.append(re.compile(r'\b(' + trig + r')\b', re.IGNORECASE))
        self.rules.append(rule)
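# --- Illustrative sketch (not part of the original module) ---------------------------
# `diacritical_charset_pattern` is used above but defined elsewhere. Judging by its name
# and usage (one sub-pattern per token, joined with r'\s+', compiled with re.IGNORECASE),
# it presumably expands each character into a character class that also matches its
# accented variants, so a trigger written without diacritics still matches text like
# "não há". The character map and the function below are assumptions for illustration,
# not the real implementation; `re` is assumed imported at module level, as above.
_DIACRITICS_SKETCH = {
    'a': 'aáàâã', 'e': 'eéê', 'i': 'ií', 'o': 'oóôõ', 'u': 'uúü', 'c': 'cç',
}

def diacritical_charset_pattern_sketch(token):
    """Hypothetical stand-in for diacritical_charset_pattern, for illustration only."""
    return ''.join(
        '[' + _DIACRITICS_SKETCH[ch.lower()] + ']' if ch.lower() in _DIACRITICS_SKETCH
        else re.escape(ch)
        for ch in token
    )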
def index_decs(laudo_groups, conn):
    """Index the sentences with DeCS descriptors.

    Arguments:
    laudo_groups - sequence of sequences, where each inner sequence holds all
        sentences of one laudo (report).
    conn - a psycopg2 connection object
    """
    DecsRecord = namedtuple('DecsRecord', 'id descritor')
    cur = conn.cursor()
    # Restrict the lookup to the DeCS trees of interest (ids starting with A-E).
    cur.execute(
        "SELECT id, descritor FROM descritor.decs "
        "WHERE id LIKE 'A%' OR id LIKE 'B%' OR id LIKE 'C%' OR id LIKE 'D%' OR id LIKE 'E%'"
    )
    decs_query_result = [DecsRecord._make(row) for row in cur.fetchall()]
    # Sort by the number of words in the descriptor, in descending order, so that
    # longer (more specific) descriptors are matched first.
    decs_query_result.sort(key=lambda row: len(row.descritor.split()), reverse=True)
    decs_patterns = list()
    for row in decs_query_result:
        # One accent- and whitespace-tolerant pattern per descriptor.
        regex_str = (r'\b'
                     + r'\s+'.join(diacritical_charset_pattern(tok) for tok in row.descritor.split())
                     + r'\b')
        decs_patterns.append(re.compile(regex_str, re.I))
    indexed_laudos = list()
    for laudo_l in laudo_groups:
        indexed_laudo = list()
        for sent in laudo_l:
            indexed_sent = IndexedSentence(sent)
            for decs_row, decs_pat in zip(decs_query_result, decs_patterns):
                for match in decs_pat.finditer(sent):
                    index = (match.start(), match.end())
                    # insertIndex reports whether the span was accepted (presumably it
                    # rejects spans overlapping one already indexed by a longer descriptor).
                    insert_index_success = indexed_sent.insertIndex(index)
                    if insert_index_success:
                        indexed_sent.insertIndexId(index, decs_row.id)
            indexed_laudo.append(indexed_sent)
        indexed_laudos.append(indexed_laudo)
    return indexed_laudos
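# --- Illustrative usage (not part of the original module) ----------------------------
# A minimal sketch of how index_decs might be driven. The connection parameters and the
# sample sentences are placeholders, not values from the original project; the
# descritor.decs table and the IndexedSentence class are assumed to exist as referenced
# above, and IndexedSentence's printable form is unspecified here.
if __name__ == '__main__':
    import psycopg2

    conn = psycopg2.connect(dbname='laudos', user='postgres')  # placeholder credentials
    laudo_groups = [
        # One laudo (report) split into sentences; Portuguese text with diacritics.
        ['Não há evidência de pneumotórax.', 'Pequeno derrame pleural à direita.'],
    ]
    for indexed_laudo in index_decs(laudo_groups, conn):
        for indexed_sent in indexed_laudo:
            print(indexed_sent)  # output format depends on IndexedSentence
    conn.close()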