Code example #1
    def __init__(self, partial_match=False, ignorecase=True, stopwords=None):
        '''
        :param partial_match: allow matching on partial (incomplete) words
        :param ignorecase: whether matching should ignore case
        :param stopwords: stopwords to skip, defaults to a very broad list
        '''
        self.A = ahocorasick.Automaton()
        self.partial_match = partial_match
        self.ignorecase = ignorecase
        if stopwords is None:
            stopwords = DOMAIN_STOP_WORDS
        idx = 0
        '''get the dictionaries from remote files'''
        vocabpath = os.path.dirname(os.path.abspath(__file__)) + "/vocabulary/"
        for dictionary_url in vocabulary_urls:
            filename = dictionary_url.split('/')[-1]
            category, reference_db = filename.split('.')[0].split(
                '_')[0].split('-')
            with open(vocabpath + filename) as f:
                dictionary = json.load(f)
            '''load the elements into the Automaton if they are long enough and not stopwords'''
            for element, element_data in list(dictionary.items()):
                ids = element_data['ids']
                pref_name = element_data['pref_name']
                if len(element) > 2:
                    element_str = element
                    if ((len(element_str) < 5) and
                        (element_str not in stopwords)
                            or (len(element_str) >= 5) and
                        (element_str.lower() not in stopwords)):
                        idx += 1
                        if self.ignorecase:
                            element_match = element_str.lower()
                        else:
                            element_match = element_str
                        try:
                            self.add_tag(element_match, idx, category,
                                         reference_db, [i for i in ids],
                                         element, element_match, pref_name)
                        except TypeError as e:
                            print(element_match)
                            print(type(element_match))
                            raise e
                        '''handle elements with dashes by also creating a copy without'''
                        if '-' in element_match:
                            element_match_without_dash = element_match.replace(
                                '-', '')
                            if len(element_match_without_dash) > 2:
                                self.add_tag(element_match_without_dash, idx,
                                             category, reference_db,
                                             [i for i in ids], element,
                                             element_match_without_dash,
                                             pref_name)
                        '''if supporting partial match'''
                        if self.partial_match:
                            for longest_token in element.split():
                                if longest_token != element and \
                                   len(longest_token) > 5 and \
                                   longest_token.lower() not in stopwords:
                                    self.add_tag(longest_token, idx,
                                                 category + '-TOKEN',
                                                 reference_db,
                                                 [i for i in ids], element,
                                                 longest_token, pref_name)

        self.A.make_automaton()
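The lookup side of this tagger is not part of the excerpt above. Below is a minimal, hedged sketch of how an automaton built this way with pyahocorasick is typically queried once make_automaton() has been called; the vocabulary, the (index, term) value layout, and the find_matches helper are illustrative assumptions, not the original project's code.

import ahocorasick

# Standalone sketch: build a small automaton whose values mirror the
# (index, term) style used in several examples on this page.
A = ahocorasick.Automaton()
for idx, term in enumerate(["egfr", "braf", "tp53"]):   # hypothetical vocabulary
    A.add_word(term, (idx, term))
A.make_automaton()

def find_matches(text, ignorecase=True):
    # Yield (start, end, term) for every dictionary hit found in `text`.
    haystack = text.lower() if ignorecase else text
    for end_index, (idx, term) in A.iter(haystack):
        start_index = end_index - len(term) + 1
        yield start_index, end_index, term

print(list(find_matches("Mutations in EGFR and BRAF were observed.")))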
Code example #2
 def setUp(self):
     self.A = ahocorasick.Automaton()
     self.words = ['GT-C3303', 'SAMSUNG-GT-C3303K/']
Code example #3
 def test_save_ints(self):
     A = ahocorasick.Automaton(ahocorasick.STORE_INTS)
     with self.assertRaisesRegex(ValueError,
                                 "expected exactly one argument"):
         A.save(self.path, None)
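For context, the save() call being tested belongs to the pickling support in newer pyahocorasick releases: for STORE_INTS / STORE_LENGTH automatons save() takes only a path, while STORE_ANY additionally needs a serializer. A hedged sketch of the round trip is shown below; the path and words are invented.

import pickle
import ahocorasick

A = ahocorasick.Automaton()            # STORE_ANY by default
A.add_word("he", (0, "he"))
A.add_word("hers", (1, "hers"))
A.make_automaton()

# STORE_ANY values are arbitrary Python objects, so a serializer is required.
A.save("/tmp/automaton.pickle", pickle.dumps)
B = ahocorasick.load("/tmp/automaton.pickle", pickle.loads)
print(list(B.iter("ushers")))          # same matches as A.iter("ushers")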
Code example #4
def updatePKL(path, allelesDB):
	allelesAC = ahocorasick.Automaton()
	for idx,key in enumerate(allelesDB):
		allelesAC.add_word(key, (idx, key))
	allelesAC.make_automaton()
	pickle.dump((allelesDB, allelesAC), open(path+'/'+'pyngSTar_alleles_AC.pkl', 'wb'))
Code example #5
 def setUp(self):
     self.A = ahocorasick.Automaton(ahocorasick.STORE_LENGTH)
     self.words = "word python aho corasick \x00\x00\x00".split()
Code example #6
File: parser.py Project: kushalc/coreutils
def build_aho_matcher(iterable, label):
    A = aho.Automaton()
    for ix, word in enumerate(iterable):
        A.add_word(word, (word, label))
    A.make_automaton()
    return A
Code example #7
 def build_actree(self, wordlist):
     actree = ahocorasick.Automaton()
     for index, word in enumerate(wordlist):
         actree.add_word(word, (index, word))
     actree.make_automaton()
     return actree
Code example #8
 def test_constructor_wrong_store(self):
     with self.assertRaisesRegex(ValueError, "store value must be one of.*"):
         ahocorasick.Automaton(-42)
Code example #9
 def test_constructor_wrong_key_type(self):
     with self.assertRaisesRegex(ValueError, "key_type must have value.*"):
         ahocorasick.Automaton(ahocorasick.STORE_ANY, -42)
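These two tests exercise the constructor's store and key_type arguments. The sketch below summarizes the accepted constants; the explanatory comments are my own summary of the pyahocorasick documentation, not part of the tests.

import ahocorasick

# store: how values are kept for each key
a_any = ahocorasick.Automaton(ahocorasick.STORE_ANY)      # arbitrary Python objects (default)
a_len = ahocorasick.Automaton(ahocorasick.STORE_LENGTH)   # value is the length of the key
a_int = ahocorasick.Automaton(ahocorasick.STORE_INTS)     # values are integers

# key_type: type of the keys
a_str = ahocorasick.Automaton(ahocorasick.STORE_ANY, ahocorasick.KEY_STRING)    # str keys (default)
a_seq = ahocorasick.Automaton(ahocorasick.STORE_ANY, ahocorasick.KEY_SEQUENCE)  # tuples of ints as keys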
Code example #10

@app.after_request
def after_request(response):
    if not hasattr(g, 'request_start_time'):
        return response
    elapsed = time.time() - g.request_start_time
    req_info = str(
        g.request_start_time) + ": " + request.method + "_" + request.url
    app.logger.debug(req_info + ":" + ' time_used:' + str(elapsed))
    return response


# metrics only end
# update sensitive
sensitive_words = ahocorasick.Automaton()


def load_sensitive_word():
    try:
        with open(BaseConfig.WORD_PATH, 'r') as f:
            lines = f.readlines()
        for line in lines:
            line = line.strip()
            if not line:
                continue
            sensitive_words.add_word(line, line)
        sensitive_words.make_automaton()
        return True
    except Exception as e:
        app.logger.error("load_sensitive_word exception:{}".format(e))
        return False
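The scanning side of this sensitive-word filter is not included in the excerpt. A minimal sketch of how the populated sensitive_words automaton could be queried follows; the contains_sensitive_word helper is an assumption for illustration, not part of the original service.

def contains_sensitive_word(text):
    # Return the first sensitive word found in `text`, or None if the text is clean.
    for _end_index, word in sensitive_words.iter(text):
        return word
    return None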
Code example #11
def get_automaton() -> Automaton:
    import ahocorasick
    return ahocorasick.Automaton()
Code example #12
File: ate.py Project: gendobr/snowball
    def c_values(self, terms, trace=False):
        terms_df = pd.DataFrame(terms, columns=['term'])
        terms_df['w'] = 1
        terms_df['len'] = len(terms_df['term'])
        """
        terms_df is
                                term  w    len
        0        feature hierarchies  1  20552
        1   rich feature hierarchies  1  20552
        2  accurate object detection  1  20552
        3      semantic segmentation  1  20552
        4             ross girshick1  1  20552

        w is always "1"
        len is number of candidate terms
        """

        term_stats = terms_df.groupby(['term'])['w'].agg([np.sum])
        term_stats['len'] = list(pd.Series(term_stats.index).apply(lambda x: len(x)))
        """
        term_stats is
        term                                                sum  len                                                        
        1 i1 r                                                1    6
        1000class imagenet benchmark                          1   28
        1000class imagenet large scale visual recogniti...    1   59
        1000class imagenet object recognition challenge       1   47
        1000class imagenet task                               1   23
        
        "term" is candidate term, primary key, values of "term" column are unique 
        "sum"  is term frequency
        "len"  is length of candidate term
        """

        # term_series is list of candidate terms
        term_series = list(term_stats.index)

        # n_terms is number of candidate terms
        n_terms = len(term_series)

        # all spaces to simplify calculation
        for i in range(0, n_terms):
            term_series[i] = ' ' + str(term_series[i]) + ' '

        # replace index
        term_stats['trm'] = term_series
        term_stats.set_index('trm', inplace=True)

        # create finite state automata
        A = ahocorasick.Automaton()
        for i in range(0, n_terms):
            A.add_word(term_series[i], (i, term_series[i]))
        A.make_automaton()

        is_part_of = []
        for i in range(0, n_terms):
            haystack = term_series[i]
            for end_index, (insert_order, original_value) in A.iter(haystack):
                if original_value != haystack:
                    # print original_value, "insideof ", haystack
                    is_part_of.append((original_value, haystack, 1))
        subterms = pd.DataFrame(is_part_of, columns=['term', 'part_of', 'w']).set_index(['term', 'part_of'])
        """
        subterms is
        
        term                                               part_of                                             w 
         imagenet benchmark                                 1000class imagenet benchmark                       1
         imagenet large                                     1000class imagenet large scale visual recognit...  1
         large scale                                        1000class imagenet large scale visual recognit...  1
         visual recognition                                 1000class imagenet large scale visual recognit...  1
         imagenet large scale visual recognition challe...  1000class imagenet large scale visual recognit...  1
        ...                                                                                                   ..
         reconstruction loss                                ℓ1 reconstruction loss                             1
         error                                              ℓ2 error                                           1
         reconstruction                                     ℓ2 pixelwise reconstruction loss                   1
         pixelwise reconstruction loss                      ℓ2 pixelwise reconstruction loss                   1
         reconstruction loss                                ℓ2 pixelwise reconstruction loss                   1

        "w" is term frequency
        """
        # print("-------------")
        # print(subterms)

        if trace:
            print("terms/subterms relations discovered ...")

        c_values = []
        # term_series=['time']
        for t in term_series:
            if t in term_stats.index:
                current_term = term_stats.loc[t]
                """
                print("-------------")
                print(('t', t, 'current_term', current_term))

                ('t', ' belief network ', 
                'current_term', sum     1
                                len    14
                                Name:  belief network , dtype: int64)
                t is string
                current_term = {sum:1, len:14}
                """

                # calculate average frequency of the superterms
                c_value = 0
                if t in subterms.index:
                    subterm_of = list(subterms.loc[t].index)
                    """
                    print(('subterm_of', subterm_of))
                    ('subterm_of', [' deep belief network ', ' directed sigmoid belief network ', ' sigmoid belief network '])

                    """
                    for st in subterm_of:
                        # term_stats.loc[st]['sum'] is frequency of superterm
                        c_value -= term_stats.loc[st]['sum']
                    c_value /= float(len(subterm_of))

                # add current term frequency
                c_value += current_term['sum']

                # multiply to log(term length)
                c_value = c_value * np.log(current_term['len']) if current_term['len'] > 0 else 0
                if trace:
                    print(t, 'freq=', current_term['sum'], ' cvalue=', c_value)
                c_values.append(c_value)
                # break

        """
        returns sorted list of tuples (candidate_term, Cvalue)
        """
        return sorted(zip([x.strip() for x in term_series], c_values), key=lambda x: x[1], reverse=True)
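A hedged usage sketch for the method above: given a flat list of candidate term strings, it returns (term, C-value) pairs sorted by score. The instance name ate and the candidate list are invented; the enclosing class is not shown in the excerpt.

candidates = [
    "feature hierarchies",
    "rich feature hierarchies",
    "accurate object detection",
    "semantic segmentation",
]
ranked = ate.c_values(candidates, trace=False)   # `ate` is a hypothetical instance
for term, score in ranked:
    print(term, score)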
Code example #13
def acPass(wordBlockSize, blockBoundaries):

    queries = {
        i: " ".join(sttsp[i * blockBoundaries:i * blockBoundaries +
                          wordBlockSize])
        for i in range(0,
                       len(sttsp) // blockBoundaries)
    }
    # math is hard but this gets an extra query at the end. bonus. nice.

    # remove duplicate queries (keep the first occurrence)
    seenQueries = set()
    uniqQueries = {}
    for i, q in queries.items():
        if q in seenQueries: continue
        seenQueries.add(q)
        uniqQueries[i] = q
    print("block len:",
          wordBlockSize,
          ", block interval:",
          blockBoundaries,
          "=> num queries:",
          len(queries),
          end="\n")

    print("removed duplicate queries:",
          str(len(queries) - len(uniqQueries)) + "/" + str(len(queries)))

    ac = ahocorasick.Automaton()
    for i, q in uniqQueries.items():
        ac.add_word(q, (i, q))
        # gaps in index are fine and are needed to restore the original string positioning in splitInds (we removed some of the indexes)
    ac.make_automaton()

    # print( joinedLansp[ 203 : 251+1 ] )
    # print( queries[ 11 ] )
    # print( sttsp[ 11*blockBoundaries : 11*blockBoundaries + wordBlockSize ] )

    # lit = ( lansp[ len( joinedLansp[:203].split() ) : len( joinedLansp[:251].split() ) ] )
    # af = ( lanspinds[ len( joinedLansp[:203].split() ) : len( joinedLansp[:251].split() ) ] )

    # print( "in lan:", lan[af[0][0]:af[-1][1]+1] )
    # print( "in stt:", stt[sttspinds[11*blockBoundaries][0] : sttspinds[11*blockBoundaries+wordBlockSize-1][1]+1] )

    # take an aho corasick match, expand it, and return char ranges, split string ranges, and original texts for both lan and stt, in addition to the matchText
    def splitInds(joinedLanspStart, joinedLanspEnd, queryIdx):
        # print( "\nSPLIT INDS", joinedLanspStart, joinedLanspEnd, queryIdx, "`" + joinedLansp[joinedLanspStart:joinedLanspEnd] + "`", queries[ queryIdx ] )

        # joinedLansp[ joinedLanspStart : joinedLanspEnd + 1 ] == queries[ queryIdx ] is the matching text
        sttspindRange = (queryIdx * blockBoundaries,
                         min(queryIdx * blockBoundaries + wordBlockSize,
                             len(sttspinds)))
        # gives you original stt char indexes for each split word
        lanspindRange = (len(joinedLansp[:joinedLanspStart - 1].split(" ")),
                         min(len(joinedLansp[:joinedLanspEnd].split(" ")),
                             len(lanspinds)))

        # if joinedLanspStart:joinedLanspEnd aren't on word boundaries, the queryIdx is matching in joinedLansp in the middle of a word, so the match has to contract until they're on word boundaries because reasons; bug where it matched [ khi, ... ] in the middle of cókhi in lan
        if sttsp[sttspindRange[0]] != lansp[lanspindRange[0]]:
            sttspindRange = (sttspindRange[0] + 1, sttspindRange[1])
            # this seemed to fix it for every case that i have hit, should keep the following check just in case; since joinedLanspStart and joinedLanspEnd aren't used and since the expansion process happens anyway, it's fine to modify in this way

        if " ".join(sttsp[slice(*sttspindRange)]) != " ".join(
                lansp[slice(*lanspindRange)]):
            print("!!! FAILED FOR SOME REASON !!!", "stt:", sttspindRange,
                  "lan:", lanspindRange)
            print("sttspindRange joined:",
                  " ".join(sttsp[slice(*sttspindRange)]))
            print("lanspindRange joined:",
                  " ".join(lansp[slice(*lanspindRange)]))
            print(
                "stt",
                list(enumerate(sttsp))[sttspindRange[0] - 10:sttspindRange[1] +
                                       10], "\nlan",
                list(enumerate(lansp))[lanspindRange[0] - 10:lanspindRange[1] +
                                       10])

        # print( 0, " ".join( sttsp[ sttspindRange[0] : sttspindRange[1] ] ), " ".join( lansp[ lanspindRange[0] : lanspindRange[1] ] ), sep="\n" ); # print the match

        # expand spindRanges
        while sttspindRange[0] - 1 >= 0 and lanspindRange[0] - 1 >= 0 and sttsp[
                sttspindRange[0] - 1] == lansp[lanspindRange[0] - 1]:
            sttspindRange = (sttspindRange[0] - 1, sttspindRange[1])
            lanspindRange = (lanspindRange[0] - 1, lanspindRange[1])
            # print( 1, str(sttspindRange) + " : " + " ".join( sttsp[ sttspindRange[0] : sttspindRange[1] ] ), str(lanspindRange) + " : " + " ".join( lansp[ lanspindRange[0] : lanspindRange[1] ] ), sep="\n" ); # print the match
        while sttspindRange[1] < len(sttsp) and lanspindRange[1] < len(
                lansp) and sttsp[sttspindRange[1]] == lansp[
                    lanspindRange[1]]:  # right range index is noninclusive
            sttspindRange = (sttspindRange[0], sttspindRange[1] + 1)
            lanspindRange = (lanspindRange[0], lanspindRange[1] + 1)
            # print( 2, str(sttspindRange) + " : " + " ".join( sttsp[ sttspindRange[0] : sttspindRange[1] ] ), str(lanspindRange) + " : " + " ".join( lansp[ lanspindRange[0] : lanspindRange[1] ] ), sep="\n" ); # print the match
        # print()

        sttOrigCharRange = (sttspinds[sttspindRange[0]][0],
                            sttspinds[sttspindRange[1] - 1][1] + 1)
        lanOrigCharRange = (lanspinds[lanspindRange[0]][0],
                            lanspinds[lanspindRange[1] - 1][1] + 1)

        return { "stt": { "spindRange": sttspindRange, "origCharRange": sttOrigCharRange, "text": stt[ sttOrigCharRange[0] : sttOrigCharRange[1] ] }, \
           "lan": { "spindRange": lanspindRange, "origCharRange": lanOrigCharRange, "text": lan[ lanOrigCharRange[0] : lanOrigCharRange[1] ] }, \
           "matchText": queries[ queryIdx ] }

    # pprint( splitInds( 219, 267, 12 ) )

    # for joinedLanspEnd, (queryIdx, query) in A.iter( joinedLansp ):
    # 	joinedLanspStart = joinedLanspEnd - len(query) + 1
    # 	print(joinedLanspStart, "-", joinedLanspEnd, ",", queryIdx, ",", query, "//", str( joinedLanspStart * 100 / len( lan ) ) + "%")
    # 	print( splitInds( joinedLanspStart, joinedLanspEnd, queryIdx ) )

    # this needs to check if any return matched multiple

    acRet = [(joinedLanspEnd, (queryIdx, query))
             for joinedLanspEnd, (queryIdx, query) in ac.iter(joinedLansp)]
    # print(acRet)
    acRetQIdxCounts = Counter(a[1][0] for a in acRet)
    print("removed multimatches:",
          [(q, cnt, queries[q])
           for q, cnt in acRetQIdxCounts.items() if cnt != 1])
    acRet = [a for a in acRet if acRetQIdxCounts[a[1][0]] == 1]
    # strip all matches which don't occur strictly once; fully remove all duplicates

    matchingSegments = [
        splitInds(joinedLanspEnd - len(query) + 1, joinedLanspEnd + 1,
                  queryIdx) for joinedLanspEnd, (queryIdx, query) in acRet
    ]

    print("num matches:",
          str(len(matchingSegments)) + "/" + str(len(queries)))

    # merge overlapping / duplicated matchings (since they've each been expanded in splitInds)
    duplicateStore = set()
    matchToRangeSet = lambda m: (*m["lan"]["origCharRange"], *m["lan"][
        "spindRange"], *m["stt"]["origCharRange"], *m["stt"]["spindRange"])
    matchingSegments = [
        m for m in matchingSegments if matchToRangeSet(m) not in duplicateStore
        and not duplicateStore.add(matchToRangeSet(m))
    ]
    # thanks https://stackoverflow.com/a/4463433

    # pprint( matchingSegments[:3] )
    return matchingSegments
Code example #14
File: wenshu.py Project: shishi11/nlp
    def __init__(self):
        # print(text)
        # self.text=text
        # self.textProcess=TextProcess()
        self.aho_policical_person = ahocorasick.Automaton()
        self.regulars = []
        # There should be three or four extraction approaches
        # This one is trigger word + dependency-parse based
        self.aho_policical_person.add_word(
            '申请执行人',
            [
                {
                    'word': '申请执行人',
                    'key': '申请执行人',
                    'field': 6,
                    'ekey': 'claimant',
                    'rel': '申请执行人[)]?[:|:]?(.*?)[,|,][男|女|住]',
                    'extrctor': 'regular'
                },
                # {'key': '申请执行人', 'field': 6, 'ekey': 'claimant', 'rel': '定中关系',
                #  'extrctor': 'pd'},
            ])
        self.aho_policical_person.add_word(
            '被执行人',
            [
                {
                    'word': '被执行人',
                    'key': '被执行人',
                    'field': 7,
                    'ekey': 'respondent',
                    'rel': '被执行人[)]?[:|:]?(.*?)[,|,][男|女|住]',
                    'extrctor': 'regular'
                },
                # {'key': '被执行人', 'field': 7, 'ekey': 'respondent', 'rel': '定中关系',
                #  'extrctor': 'pd'}
            ])
        self.aho_policical_person.add_word('审判员', [{
            'word': '审判员',
            'key': '审判员',
            'field': 12,
            'ekey': 'judger',
            'rel': '审判员(.*)',
            'extrctor': 'regular'
        }])
        self.aho_policical_person.add_word('审判长', [{
            'word': '审判长',
            'key': '审判员',
            'field': 12,
            'ekey': 'judger',
            'rel': '审判长(.*)',
            'extrctor': 'regular'
        }])
        self.aho_policical_person.add_word('执行员', [{
            'word': '执行员',
            'key': '审判员',
            'field': 12,
            'ekey': 'judger',
            'rel': '审判长(.*)',
            'extrctor': 'regular'
        }])
        m = r'((?:[一二三四五六七八九十千万亿兆幺零百壹贰叁肆伍陆柒捌玖拾佰仟]+(?:点[一二三四五六七八九幺零]+){0,1})|\d+(?:[\.,,]\d+)*万?)元'
        # ' ((?:[一二三四五六七八九十千万亿兆幺零百]+(?:点[一二三四五六七八九幺零]+){0,1})|\d+(?:[\.,,]\d+)+万?)元'
        m1 = r'(?:支付|给付|清偿|偿还|^(?:(?!未).)*执行到位).*?' + m
        # trigger words: 清偿 (settle), 偿还 (repay), 执行到位 (enforced in full)
        # note the "amount remaining, XXX 未执行到位" pattern, where 未执行到位 means "not yet enforced"
        #
        shean = [{
            'key': '标的',
            'field': 8,
            'ekey': 'judger',
            'word': '标的',
            'rel': m1,
            'extrctor': 'regular'
        }]
        self.aho_policical_person.add_word('支付', shean)
        self.aho_policical_person.add_word('给付', shean)
        self.aho_policical_person.add_word('清偿', shean)
        self.aho_policical_person.add_word('偿还', shean)
        self.aho_policical_person.add_word('退赔', shean)
        self.aho_policical_person.add_word('执行到位', [{
            'key': '标的',
            'field': 8,
            'ekey': 'judger',
            'word': '执行到位',
            'rel': '执行到位.*?' + m,
            'extrctor': 'regular'
        }])
        self.aho_policical_person.add_word('执行标的', [{
            'key': '标的',
            'field': 8,
            'ekey': 'judger',
            'word': '执行标的',
            'rel': '执行标的.*?' + m,
            'extrctor': 'regular'
        }])
        self.aho_policical_person.add_word('执行总标的', [{
            'key': '标的',
            'field': 8,
            'ekey': 'judger',
            'word': '执行总标的',
            'rel': '执行总标的.*?' + m,
            'extrctor': 'regular'
        }])
        # self.aho_policical_person.add_word('')

        # Data structure:
        # 0  trial court,                       required   ok
        # 1  case type,                         required   ok
        # 2  cause of action,                   required   ok
        # 3  document type,                     required   ok
        # 4  case number,                       required   ok
        # 5  judgment date,                     required   ok
        # 6  enforcement applicant,             required   ok
        # 7  persons subject to enforcement [], required   ok
        # 8  subject amount,                    optional
        # 9  enforced amount,                   optional
        # 10 fully enforced or not,             required
        # 11 whether this enforcement round is terminated, required   ok
        # 12 judge,                             required
        # 13 related judgment documents,        required
        # 14 sub-type,  optional -----> add (or revoke adding) a party, withdraw enforcement, terminate the review procedure, reject, transfer, suspend, change of applicant
        self.aho_policical_person.make_automaton()

        # Pre-load data
        try:
            self.casepd = pandas.read_csv('./resources/case_detail_list.csv',
                                          index_col=0)
        except:
            self.casepd = pandas.DataFrame()
        # self.meta_person=pandas.DataFrame(columns=['name','detail'])

        try:
            self.meta_company = pandas.read_csv('./resources/company_list.csv',
                                                index_col=0)
            self.meta_company = self.meta_company.drop_duplicates(subset=None)
        except:
            self.meta_company = pandas.DataFrame(columns=['name', 'detail'])
        self.moneysent = []
Code example #15
 def __init__(self):
     self.ac = ahocorasick.Automaton()
Code example #16
File: index.py Project: pombredanne/findlicense
    def __init__(self, rules=None, _ranked_tokens=global_tokens_by_ranks):
        """
        Initialize the index with an iterable of Rule objects.
        """
        # total number of unique known tokens
        self.len_tokens = 0

        # largest token ID for a "junk" token. A token with a smaller id than
        # len_junk is considered a "junk" very common token
        self.len_junk = 0
        # corresponding number of non-junk tokens: len_tokens = len_junk + len_good
        self.len_good = 0

        # mapping of token string -> token id
        self.dictionary = {}

        # mapping of token id -> token string as a list where the index is the
        # token id and the value the actual token string
        self.tokens_by_tid = []

        # Note: all mappings of rid-> data are lists of data where the index
        # is the rule id.

        # rule objects proper
        self.rules_by_rid = []

        # token_id sequences
        self.tids_by_rid = []

        # mapping of rule id -> (mapping of token_id -> [positions, ...])
        # We track only high/good tokens there. This is a "traditional"
        # positional inverted index
        self.high_postings_by_rid = []

        # mapping of rule_id -> tuple of low and high tokens ids sets/multisets
        # (low_tids_set, high_tids_set)
        self.tids_sets_by_rid = []
        # (low_tids_mset, high_tids_mset)
        self.tids_msets_by_rid = []

        # mapping of hash -> single rid : duplicated rules are not allowed
        self.rid_by_hash = {}

        # Aho-Corasick automatons for negative and small rules
        self.rules_automaton = ahocorasick.Automaton(ahocorasick.STORE_ANY)
        self.negative_automaton = ahocorasick.Automaton(ahocorasick.STORE_ANY)

        # disjunctive sets of rule ids: regular, negative, small, false positive
        self.regular_rids = set()
        self.negative_rids = set()
        self.small_rids = set()
        self.false_positive_rids = set()

        # length of the largest false_positive rule
        self.largest_false_positive_length = 0
        # mapping of hash -> rid for false positive rule tokens hashes
        self.false_positive_rid_by_hash = {}

        # if True the index has been optimized and becomes read only:
        # no new rules can be added
        self.optimized = False

        if rules:
            if TRACE_INDEXING_PERF:
                start = time()
                print('LicenseIndex: building index.')

            # index all and optimize
            self._add_rules(rules, _ranked_tokens)

            if TRACE_INDEXING_PERF:
                duration = time() - start
                len_rules = len(self.rules_by_rid)
                print(
                    'LicenseIndex: built index with %(len_rules)d rules in %(duration)f seconds.'
                    % locals())
                self._print_index_stats()
Code example #17
	def setUp(self):
		self.A = ahocorasick.Automaton()
		self.words = "word python aho corasick \x00\x00\x00".split()
		self.inexisting = "test foo bar dword".split()
Code example #18
def _get_entity_text(train, flag='train'):
    if flag == 'train':
        print('*' * 20 + 'neurips 3321 + stage1_test 368' + '*' * 20)
    else:
        print('*' * 20 + 'stage 2 986' + '*' * 20)
    gene_list = set(train['Gene'].tolist())
    var_list = set(train['Variation'].tolist())
    print('unique gene in ' + flag + ':', len(gene_list),
          ' unique variation in ' + flag + ':', len(var_list))

    sentences = set(
        list(chain.from_iterable(
            train['Text'].map(lambda x: sent_tokenize(x)))))
    print('number of unique sentences in all samples:', len(sentences))

    print('Building gene tree...')
    gene_tree = ahocorasick.Automaton()
    for idx, gene in enumerate(gene_list):
        gene_tree.add_word(gene.lower(), (idx, gene.lower()))

    gene_tree.make_automaton()

    print('Assigning gene occurrence sentences to gene tree...')
    gene_dict = dict()
    for sent in sentences:
        word_list = word_tokenize(sent.lower())
        for word in word_list:
            if word in gene_tree:
                if word in gene_dict:
                    gene_dict[word] += sent
                else:
                    gene_dict[word] = sent

    print('Writing gene occurrence sentences to file...')
    df_gene = pd.DataFrame.from_dict(gene_dict, orient='index').reset_index()
    df_gene.columns = ['Gene', 'Text']
    df_gene.to_csv('data/intermediate/' + flag + '_gene_text',
                   index=False,
                   sep='|')

    print('Building variation tree...')
    var_tree = ahocorasick.Automaton()
    for idx, var in enumerate(var_list):
        var_tree.add_word(var.lower(), (idx, var.lower()))

    var_tree.make_automaton()

    print('Assigning variation occurrence sentences to variation tree...')
    var_dict = dict()
    for sent in sentences:
        word_list = word_tokenize(sent.lower())
        for word in word_list:
            if word in var_tree:
                if word in var_dict:
                    var_dict[word] += sent
                else:
                    var_dict[word] = sent

    print('Writing variation occurrence sentences to file...')
    df_gene = pd.DataFrame.from_dict(var_dict, orient='index').reset_index()
    df_gene.columns = ['Variation', 'Text']
    df_gene.to_csv('data/intermediate/' + flag + '_variation_text',
                   index=False,
                   sep='|')
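One subtlety worth noting: `word in gene_tree` above tests exact key membership in the trie, which is different from running the Aho-Corasick scan itself via iter(). A small standalone sketch of the two lookups (not from the original project):

import ahocorasick

A = ahocorasick.Automaton()
A.add_word("egfr", (0, "egfr"))
A.make_automaton()

print("egfr" in A)                    # True: exact key membership
print("egf" in A)                     # False: not a stored key
print(list(A.iter("the egfr gene")))  # [(7, (0, 'egfr'))]: substring scan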
Code example #19
 def build_ac_automation_from_strings(cls, keys: List[str]) -> Any:
     atm = ahocorasick.Automaton()  # pylint: disable=c-extension-no-member
     for idx, key in enumerate(keys):
         atm.add_word(key, (idx, key))
     atm.make_automaton()
     return atm
Code example #20
File: tag_doc.py Project: lnnnnnn/KGSearch
 def __init__(self, tags):
     self.tree = ahocorasick.Automaton()
     for tag in tags:
         self.tree.add_word(TagsTrie.cut_sent(tag), tag)
     self.tree.make_automaton()
Code example #21
def make_pyahocorasick_automaton(patterns):
    automaton = ahocorasick.Automaton()
    for key in patterns:
        automaton.add_word(key, key)
    automaton.make_automaton()
    return automaton
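A hedged usage example for the helper above; the patterns and the haystack are invented.

automaton = make_pyahocorasick_automaton(["he", "she", "hers"])
for end_index, pattern in automaton.iter("ushers"):
    # prints each matched pattern with the index of its last character
    print(pattern, "ends at", end_index)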
Code example #22
    def run(self):
        self.A = ahocorasick.Automaton()

        self.add_words()
        self.remove()
Code example #23
#!/usr/bin/python
# -*- coding:utf-8 -*-

import ahocorasick
A = ahocorasick.Automaton()

A.add_word('最优秀', (0, '最优秀'))
A.add_word('最佳', (0, '最佳'))
A.add_word('最好', (0, '最好'))
# A.add_word('c', (1, 'c'))

# if 'hahhahah' in  A :
#     print("TRUE")
# else:
#     print("TRUE")


A.make_automaton()
# a=0
# for item in A.iter("_hahhahahhahhahhaheeeee_"):
#     a=a+1
 

# import  mysql.connector
# config = {
#     'host': '121.42.33.20',
#     'user': '******',
#     'password': '******',
#     'port': 8306,
#     'database': 'htmp',
#     'charset': 'utf8'
Code example #24
relative_paths = glob.iglob('%s/**/*.tex' % target_dir, recursive=True)
relative_paths = set(
    [path.replace(target_dir, '').lstrip('/') for path in relative_paths])

ignore_paths = [glob.glob(path) for path in args.exclude]
ignore_paths = set(chain.from_iterable(ignore_paths))

stats = defaultdict(lambda: 0)

# *****************************************************************************
# * Build Aho-Corasick tree
# *****************************************************************************

print("==> Preparing Aho-Corasick concept matcher", file=sys.stderr)
matcher = ahocorasick.Automaton()
with open(args.concepts) as con_f:
    reader = csv.DictReader(con_f)
    for idx, row in enumerate(reader):
        matcher.add_word(row['match'], (row['match'], row['concept']))
        stats['index terms'] += 1
matcher.make_automaton()

# *****************************************************************************
# * Annotate index entries
# *****************************************************************************

print("==> Inserting \\index{...} entries", file=sys.stderr)

stats['annotation distribution'] = defaultdict(lambda: 0)
Code example #25
 def setUp(self):
     self.A = ahocorasick.Automaton()
     words = "word python aho corasick tree bark branch root".split()
     for word in words:
         self.A.add_word(conv(word), 1)
Code example #26
File: lexicon.py Project: yorick76ee/MiNLP
 def __init__(self, file_or_list=None):
     self.ac = ahocorasick.Automaton(ahocorasick.STORE_LENGTH)
     if file_or_list:
         self.add_words(file_or_list)
     self.interfere_factor = DEFAULT_INTERFERE_FACTOR
Code example #27
 def setUp(self):
     self.A = ahocorasick.Automaton(ahocorasick.STORE_ANY,
                                    ahocorasick.KEY_SEQUENCE)
Code example #28
}

# function words
f_func_words = open(FUNCTION_WORDS_FILE_PATH, "r", encoding="utf-8")
func_words = f_func_words.readlines()

FUNCTION_WORDS_LIST = [word.rstrip() for word in func_words]

# fixed expressions
f_fixed_expression = open(FIXED_EXPRESSIONS_FILE_PATH, "r", encoding="utf-8")
fixed_expressions = f_fixed_expression.readlines()

FIXED_EXPRESSIONS_LIST = [word.rstrip() for word in fixed_expressions]

# AC Automaton
AC_AUTOMATON = ahocorasick.Automaton()
for idx, key in enumerate(FIXED_EXPRESSIONS_LIST):
    AC_AUTOMATON.add_word(key, (idx, key))
AC_AUTOMATON.make_automaton()

# BPE
SPM = spm.SentencePieceProcessor()
SPM.Load(BPE_MODEL_PATH)

# special tokens
SPECIAL_TOKENS = {
    "pad": "<pad>",
    "oov": "<oov>",
    "sos": "<sos>",
    "eos": "<eos>"
}
Code example #29
    def test_get_from_an_empty_automaton(self):
        A = ahocorasick.Automaton()

        r = A.get('foo', None)
        self.assertEqual(r, None)
Code example #30
def build_sid_ahoc_queries(sample_ids):
    acs = ahocorasick.Automaton()
    # need to add "," and ":" so we don't match coverage counts or hits inside other sample IDs
    for sid in sample_ids:
        acs.add_word("," + sid + ":", sid)
    acs.make_automaton()
    return acs
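A hedged sketch of how the returned automaton might be applied to a comma-delimited string of sample_id:coverage pairs; the sample IDs and the input line are invented.

acs = build_sid_ahoc_queries(["S1", "S2", "S10"])
line = ",S1:12,S10:3,S2:7"
found = [sid for _end_index, sid in acs.iter(line)]
print(found)  # ['S1', 'S10', 'S2']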