Code Example #1
class WordList(object):
    def __init__(self, word_file):
        # TODO: Check input file exists, is readable, valid, etc
        words = []
        with open(word_file) as input_file:
            for word in input_file:
                words.append(word.lower().strip())
        self.trie = Trie(words)

    def contains_word(self, word):
        """
        Check whether a word exists in the list.
        
        :param word: An ASCII, lowercase string to check for.
        :return: True if the word is in the word list, False if it is not.
        """
        # TODO: Raise errors if the word is None, isn't ASCII or lowercase, etc
        return word in self.trie

    def contains_prefix(self, prefix):
        """
        Check list for words that begin with the supplied prefix
        
        :param prefix: An ASCII, lowercase string to check as a prefix
        :return: True if this key is a prefix for some other word or words in 
        the list. Note that this method will return False if the word is in the
        list but is not a prefix of any other word.
        """
        # TODO: Raise errors if prefix is None, isn't ASCII or lowercase, etc
        return len(self.trie.keys(prefix)) > 1
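
A minimal usage sketch for the class above, assuming `Trie` is a marisa_trie-style trie (one that supports membership tests with `in` and prefix listing with `.keys(prefix)`) and that `words.txt` is a hypothetical newline-separated word file:

from marisa_trie import Trie  # one possible Trie implementation (an assumption)

word_list = WordList('words.txt')         # hypothetical word file
print(word_list.contains_word('apple'))   # True if 'apple' is in the file
print(word_list.contains_prefix('app'))   # True if more than one listed word starts with 'app'
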
Code Example #2
def rebuild_database() -> None:
    """Rebuild the search database."""
    global database
    LOGGER.info('Updating search database...')
    # Clear and reset.
    word_to_ids.clear()

    for item in UI.item_list.values():
        for subtype_ind in item.visual_subtypes:
            for tag in item.get_tags(subtype_ind):
                for word in tag.split():
                    word_to_ids[word.casefold()].add((item.id, subtype_ind))
    database = Trie(word_to_ids.keys())
    LOGGER.debug('Tags: {}', database.keys())
    _type_cback()
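
A sketch of how this trie-backed index might be queried afterwards (an assumption about usage, not code from the project above): each search term is treated as a prefix, expanded through `database.keys()`, and the ID sets from `word_to_ids` are intersected across terms.

def search(query: str) -> set:
    """Return the (item_id, subtype) pairs that match every word in `query`."""
    result = None
    for term in query.casefold().split():
        # Collect IDs for every indexed word that starts with this term.
        matches = set()
        for word in database.keys(term):
            matches |= word_to_ids[word]
        result = matches if result is None else result & matches
    return result or set()
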
Code Example #3
    def build(db, entity_db, min_word_count, min_entity_count, white_list, pool_size, chunk_size):
        word_counter = Counter()
        entity_counter = Counter()

        if white_list is not None:
            with open(white_list, 'r') as white_list_file:
                white_list = json.load(white_list_file)

        tokenizer = RegexpTokenizer()

        with click.progressbar(db.keys()) as bar:
            for title in bar:
                obj = db[title]
                text = obj['text']
                tokens = tokenizer.tokenize(text)

                word_counter.update(t.text.lower() for t in tokens)

                for (_, link_title, _) in obj['links']:
                    link_title = entity_db.resolve_redirect(link_title)
                    entity_counter[link_title] += 1

        word_dict = Trie([w.lower() for (w, c) in word_counter.items()
                          if c >= min_word_count])

        if white_list is None:
            entity_dict = Trie([e.lower() for (e, c) in entity_counter.items()
                                if c >= min_entity_count])
        else:
            entity_dict = Trie([e.lower() for (e, c) in entity_counter.items()
                                if c >= min_entity_count] + white_list)

        entities_dict = Trie(entity_dict.keys())

        return Vocab(word_dict, entities_dict)
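
The trie here serves as a compact string-to-ID vocabulary. A minimal sketch of that role, assuming a marisa_trie-style Trie (integer ID lookup via `trie[key]`, reverse lookup via `trie.restore_key(id)`); `Vocab` itself is this project's own wrapper, whose exact API is not shown here:

from marisa_trie import Trie

word_dict = Trie(['apple', 'banana', 'cherry'])
idx = word_dict['banana']          # stable integer ID assigned by the trie
print(word_dict.restore_key(idx))  # 'banana'
print('apple' in word_dict)        # membership test
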
Code Example #4
class NWD(object):
    """New Word Detection

    Main class in this package

    Parameters
    ----------
    max_len : int
        Maximum n-gram length
    min_freq : int
        Minimum frequency threshold
    min_pmi : float
        Minimum PMI threshold
    min_entropy : float
        Minimum entropy threshold
    cut : bool, optional (default=True)
        Whether to cut each sequence into words or not
    tokenizer : str, optional (default='jieba')
        Which tokenizer to use
    norm_pmi : bool, optional (default=False)
        Whether to normalize PMI or not
    """
    def __init__(self,
                 max_len,
                 min_freq,
                 min_pmi,
                 min_entropy,
                 cut=True,
                 tokenizer='jieba',
                 norm_pmi=False):
        self.max_len = max_len
        self.min_freq = min_freq
        self.min_pmi = min_pmi
        self.min_entropy = min_entropy
        self.cut = cut
        self.norm_pmi = norm_pmi

        # Initialize the dictionaries used to build the forward and reverse tries
        self.trie = defaultdict(int)
        self.rev_trie = defaultdict(int)
        self.len = 0

        # Build a trie of the existing dictionary
        sistring = set()
        if 'jieba_dict_path' in config['DEFAULT'] and os.path.isfile(
                config['DEFAULT']['jieba_dict_path']):
            sistring = get_sistring(config['DEFAULT']['jieba_dict_path'])
        if 'user_dict_path' in config['DEFAULT'] and os.path.isfile(
                config['DEFAULT']['user_dict_path']):
            sistring = get_sistring(config['DEFAULT']['user_dict_path'],
                                    sistring)
        self.dict = Trie(sistring)
        # Get blacklist
        self.blacklist = set()
        if 'blacklist_path' in config['DEFAULT'] and os.path.isfile(
                config['DEFAULT']['blacklist_path']):
            self.blacklist = get_dict(config['DEFAULT']['blacklist_path'])

        if cut:
            if tokenizer == 'jieba':
                self.tokenizer = Jieba()
            else:
                raise ValueError(f'Unknown tokenizer {tokenizer}')

    def fit(self, docs):
        """Fit model according to documents

        Returns
        -------
        None
        """
        check_docs(docs)

        # Text preprocessing
        docs = self.preprocess_docs(docs)

        # Build trie tree
        self.build_tree(docs)

    def detect(self, docs):
        """Detect new words from documents

        Returns
        -------
        new_words : list
        """
        # Cut each doc into a list of words
        if self.cut:
            docs = self.cut_docs(docs)

        # Get candidate words
        cand_words, word2doc = self.get_candidate_words(docs)

        new_words = []
        for cand_word in tqdm(cand_words):
            # `cand_word` is a tuple of tokens
            freq = self.get_freq(cand_word)
            if freq > self.min_freq:
                pmi_score = self.get_pmi(cand_word)
                if pmi_score > self.min_pmi:
                    entropy_score = self.get_entropy(cand_word)
                    if entropy_score > self.min_entropy:
                        cand_word_str = ''.join(cand_word)
                        new_word = (cand_word_str, cand_word,
                                    len(cand_word_str), freq, pmi_score,
                                    entropy_score, word2doc[cand_word])
                        new_words.append(new_word)

        return new_words

    def fit_detect(self, docs):
        """Fit the model and then detect new words in the same documents"""
        self.fit(docs)
        return self.detect(docs)

    def test(self, docs, options):
        """Testing interface

        Parameters
        ----------
        docs : list of str
        options : dict mapping PreStr method names to bool
        """
        check_docs(docs)
        if self.cut:
            # Cut each doc into a list of words
            docs = self.cut_docs(docs)

        class_methods = PreStr.__dict__  # Get all class methods in PreStr class
        methods = []
        for option in options:
            if options[option]:
                methods.append(class_methods[option])
        for i, doc in enumerate(docs):
            docs[i] = PreStr(doc).pipeline(methods)
        return docs

    def cut_docs(self, docs):
        for i, doc in enumerate(docs):
            docs[i] = list(self.tokenizer.cut(doc))
        return docs

    def preprocess_docs(self, docs):
        for i, doc in enumerate(docs):
            docs[i] = PreStr(doc).sub_url().sub_punc().agg_sub_symbol()
        return docs

    def build_tree(self, docs):
        """Build trie tree and reverse trie tree
        """
        for doc in docs:
            for sent in get_sents_from_doc(doc):
                rev_sent = sent[::-1]

                # The PAT tree is built over semi-infinite strings (all suffixes of the sentence)
                for i in range(len(sent)):
                    self.trie[sent[i:]] += 1
                    self.rev_trie[rev_sent[i:]] += 1
                self.len += len(sent)

        # Build real trie tree
        self.trie = BTrie().build(self.trie)
        self.rev_trie = BTrie().build(self.rev_trie)

    def get_candidate_words(self, docs):
        cand_words = set()
        word2doc = defaultdict(list)  # Map each word to the documents it appears in
        for i, doc in enumerate(docs):
            doc_set = set()
            for sent in get_sents_from_doc(doc):
                doc_set |= set(
                    everygrams(sent, min_len=2, max_len=self.max_len))
            cand_words |= doc_set
            for ngram in doc_set:
                word2doc[ngram].append(i)

        # Filter candidate words based on rules
        cand_words = filter(
            self.filter_word,
            cand_words)  # `filter` returns a lazy iterator
        cand_words = list(cand_words)

        return cand_words, word2doc

    def get_freq(self, word):
        """Get word frequency

        Parameters
        ----------
        word : tuple of str

        Returns
        -------
        freq : int
        """
        return sum(self.trie.items(''.join(word)).values())

    def get_pmi(self, word):
        """Get word pmi

        Parameters
        ----------
        word : tuple of str

        Returns
        -------
        pmi : float
        """
        # Get probability of xy, x and y
        word_x = ''.join(word[:-1])
        word_y = ''.join(word[1:])
        word = ''.join(word)

        xy = sum(self.trie.items(word).values()) / self.len
        x = sum(self.trie.items(word_x).values()) / self.len
        y = sum(self.trie.items(word_y).values()) / self.len
        if self.norm_pmi:
            pmi_score = npmi(xy, x, y)
        else:
            pmi_score = pmi(xy, x, y)

        return pmi_score
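
    # Note: `pmi` and `npmi` above are helper functions defined elsewhere in
    # this package. A common formulation (an assumption, not necessarily this
    # package's exact implementation) is:
    #
    #     def pmi(xy, x, y):
    #         return math.log(xy / (x * y))
    #
    #     def npmi(xy, x, y):
    #         return pmi(xy, x, y) / -math.log(xy)
    #
    # where xy, x and y are the joint and marginal probabilities computed in
    # `get_pmi` above.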

    def get_entropy(self, word):
        """Get word entropy

        Parameters
        ----------
        word : tuple of str

        Returns
        -------
        entropy : float
        """
        right_neighbors = defaultdict(int)
        left_neighbors = defaultdict(int)
        word = ''.join(word)
        rev_word = word[::-1]

        # Get the suffix strings prefixed by the word and count the character
        # immediately after the word; if the string is the word itself, use
        # `SUB_SYMBOL` to represent its neighbor
        for sent, tf in self.trie.items(word).items():
            neighbor = SUB_SYMBOL if sent == word else sent[len(word)]
            right_neighbors[neighbor] += tf

        for sent, tf in self.rev_trie.items(rev_word).items():
            neighbor = SUB_SYMBOL if sent == rev_word else sent[len(rev_word)]
            left_neighbors[neighbor] += tf

        # Flatten the dicts into frequency lists, counting each `SUB_SYMBOL`
        # neighbor as a separate occurrence
        right_tf = []
        left_tf = []

        for neighbor, tf in right_neighbors.items():
            if neighbor == SUB_SYMBOL:
                right_tf += [1] * tf
            else:
                right_tf.append(tf)

        for neighbor, tf in left_neighbors.items():
            if neighbor == SUB_SYMBOL:
                left_tf += [1] * tf
            else:
                left_tf.append(tf)

        right_entropy_score = entropy(right_tf)
        left_entropy_score = entropy(left_tf)

        return min(right_entropy_score, left_entropy_score)
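
    # Note: `entropy` above is a helper defined elsewhere in this package.
    # Branch entropy is conventionally the Shannon entropy of the
    # neighbor-frequency distribution, e.g. (an assumed reference
    # implementation, not necessarily the package's own):
    #
    #     def entropy(tfs):
    #         total = sum(tfs)
    #         return -sum(tf / total * math.log(tf / total) for tf in tfs)
    #
    # Taking the minimum of the left and right entropies penalises candidates
    # whose boundary is rigid on either side.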

    def get_word_score(self, word):
        """Get score of word

        Parameters
        ----------
        word : tuple of str

        Returns
        -------
        freq : int
        pmi_score : float
        entropy_score : float
        """
        freq = self.get_freq(word)
        pmi_score = self.get_pmi(word)
        entropy_score = self.get_entropy(word)

        return freq, pmi_score, entropy_score

    def filter_word(self, word):
        word = ''.join(word)
        if re.match(r'^(.)\1*$', word):  # Remove words made of one repeated character
            return False
        elif re.match(rf'^({RE_PREP}).*|.*({RE_PREP})$',
                      word):  # Remove words that start or end with a preposition
            return False
        elif re.match(rf'^({RE_STOPWORDS}).*|.*({RE_STOPWORDS})$',
                      word):  # Remove words that start or end with a stopword
            return False
        elif re.match(r'.*年?\d*月\d*日?', word):  # Remove dates
            return False
        elif self.dict.keys(word):  # Remove words already in the existing dictionary
            return False
        elif word in self.blacklist:  # Remove blacklisted words
            return False
        elif re.match(r'^[\u4E00-\u9FD5a-zA-Z]+$', word):
            # Keep words made only of CJK characters and ASCII letters
            return True
        else:
            return False

    def merge(self, nwd):
        """Merge another NWD instance's tries and length counter into this one"""
        if type(nwd) != type(self):
            raise TypeError(f'Type {type(nwd)} not equal to {type(self)}')

        self.trie.merge(nwd.trie)
        self.rev_trie.merge(nwd.rev_trie)
        self.len += nwd.len

        return self

    def save(self, file_path):
        """Pickle the detector to `file_path`"""
        with open(file_path, 'wb') as file:
            pickle.dump(self, file)
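
A minimal usage sketch for the class above (the documents, threshold values and output path are illustrative assumptions, not values from the source):

detector = NWD(max_len=4, min_freq=5, min_pmi=1.0, min_entropy=2.0)
docs = ['今天天气真好，出去走走', '今天心情也很好']  # hypothetical raw documents
new_words = detector.fit_detect(docs)
for word_str, tokens, length, freq, pmi_score, entropy_score, doc_ids in new_words:
    print(word_str, freq, pmi_score, entropy_score)
detector.save('nwd.pkl')  # hypothetical output path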