Code example #1
File: negex.py  Project: fcbertoldi/tcc
    def __init__(self, rules=None, pos_trigger_window=6, precedence=PRECEDENCE_DICT, trigger_tag_d=TRIGGER_TAG_DICT):
        self.pos_trigger_window = pos_trigger_window
        self.word_tokenizer = tokenizer.Tokenizer()
        self.precedence = precedence
        self.trigger_tag_dict = trigger_tag_d
        
        self.whitespace_pat = re.compile(r'\s+')
        self.termination_tag_pat = re.compile(r'PSEU|PREN|PREP|POST|POSP|CONJ', re.IGNORECASE)
        self.trigger_tag_pat = re.compile(r'PREN|PREP|POST|POSP', re.IGNORECASE)
        self.filler = '_'
        self.rules = list()
        self.triggers = list()

        if not rules:
            # No rules supplied: fall back to the bundled trigger-term file
            # ("termos_disparadores.txt", Portuguese for "trigger terms").
            triggers_path = os.path.join(os.path.dirname(__file__), 'termos_disparadores.txt')
            rules = open(triggers_path)

        csv_rule = csv.reader(rules, delimiter='|', quoting=csv.QUOTE_NONE, escapechar='\\')
        next(csv_rule) # skip header
        for rule in csv_rule:
            # Compile a diacritics-tolerant, whitespace-flexible pattern
            # for the trigger term in the rule's first column.
            trigger = rule[0].strip()
            trigger_tokens = trigger.split()
            trig = r'\s+'.join(diacritical_charset_pattern(tok) for tok in trigger_tokens)
            rule.append(re.compile(r'\b(' + trig + r')\b', re.IGNORECASE))
            self.rules.append(rule)
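
Both code examples on this page rely on a diacritical_charset_pattern helper that is not shown here. As a reading aid, a minimal sketch of what such a helper could look like follows, assuming it widens each base letter into a character class covering its common Portuguese accented forms; the _VARIANTS table and the function body are illustrative assumptions, not the project's actual code.

import re
import unicodedata

# Hypothetical mapping from a base letter to its accented variants
# (an assumption; the real project may use a different table or approach).
_VARIANTS = {
    'a': 'aáàâã', 'e': 'eéê', 'i': 'ií',
    'o': 'oóôõ', 'u': 'uúü', 'c': 'cç',
}

def diacritical_charset_pattern(token):
    """Build a regex fragment matching `token` with or without diacritics."""
    parts = []
    for ch in token:
        # Reduce an accented character to its base letter, e.g. 'á' -> 'a'.
        base = unicodedata.normalize('NFKD', ch)[0].lower()
        if base in _VARIANTS:
            parts.append('[' + _VARIANTS[base] + ']')
        else:
            parts.append(re.escape(ch))
    return ''.join(parts)

With a helper like this, diacritical_charset_pattern('nao') yields 'n[aáàâã][oóôõ]', so a compiled trigger pattern still fires when the input text drops its accents.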
Code example #2
def index_decs(laudo_groups, conn):
    """Index the sentences with DeCS descriptors.

    Argumentos:

    laudo_groups - sequence of sequences, where each sequence is the set of all sentences in a laudo.
    conn - a psycopg2 connection object
    """
    DecsRecord = namedtuple('DecsRecord', 'id descritor')

    cur = conn.cursor()
    # Restrict the query to descriptor ids in DeCS trees A through E
    # (anatomy, organisms, diseases, chemicals and drugs, techniques).
    cur.execute("SELECT id, descritor FROM descritor.decs WHERE id LIKE 'A%' OR id LIKE 'B%' OR id LIKE 'C%' OR id LIKE 'D%' OR id LIKE 'E%'")
    decs_query_result = [DecsRecord._make(row) for row in cur.fetchall()]
    # Sort by the number of words in the descriptor, in descending order,
    # so longer (multi-word) descriptors are matched first.
    decs_query_result.sort(key=lambda row: len(row.descritor.split()), reverse=True)
    decs_patterns = list()
    for row in decs_query_result:
        regex_str = r'\b' + r'\s+'.join(diacritical_charset_pattern(tok) for tok in row.descritor.split()) + r'\b'
        decs_patterns.append(re.compile(regex_str, re.I))

    indexed_laudos = list()
    for laudo_l in laudo_groups:
        indexed_laudo = list()
        for sent in laudo_l:
            indexed_sent = IndexedSentence(sent)
            for decs_row, decs_pat in zip(decs_query_result, decs_patterns):
                for match in decs_pat.finditer(sent):
                    index = (match.start(), match.end())
                    insert_index_success = indexed_sent.insertIndex(index)
                    if insert_index_success:
                        indexed_sent.insertIndexId(index, decs_row.id)

            indexed_laudo.append(indexed_sent)

        indexed_laudos.append(indexed_laudo)

    return indexed_laudos
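
index_decs also depends on an IndexedSentence class that is not reproduced on this page. Given that descriptors are matched longest-first and that insertIndex returns a success flag, a plausible minimal sketch is one where a span is accepted only if it does not overlap a span already indexed; the class below is an assumption about the interface, not the project's implementation.

class IndexedSentence:
    """Sketch (assumed behavior): a sentence plus non-overlapping
    (start, end) spans, each mapped to the DeCS id that matched it."""

    def __init__(self, sentence):
        self.sentence = sentence
        self.indexes = []      # accepted (start, end) spans
        self.index_ids = {}    # (start, end) -> DeCS descriptor id

    def insertIndex(self, index):
        """Accept a span only if it overlaps no existing span."""
        start, end = index
        for s, e in self.indexes:
            if start < e and s < end:  # half-open interval overlap test
                return False
        self.indexes.append(index)
        return True

    def insertIndexId(self, index, decs_id):
        self.index_ids[index] = decs_id

Under that reading, sorting the query result by descriptor word count in descending order lets multi-word descriptors claim their spans first, and the overlap check then keeps a shorter descriptor from re-indexing a fragment of an already matched phrase.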