Example #1
    def getNext(self, normalize_digits=True):
        line = self.__source_file.readline()
        # skip multiple blank lines.
        while len(line) > 0 and len(line.strip()) == 0:
            line = self.__source_file.readline()
        if len(line) == 0:
            return None

        # each instance is a single line: "<sentence>\t<label>"
        line = line.strip().split('\t')
        label = line[1]
        words = line[0].strip().split(' ')
        word_ids = []
        for pos, word in enumerate(words):
            word = DIGIT_RE.sub("0", word) if normalize_digits else word
            if self.refiner is not None:
                word_id = self.__word_alphabet.get_index(word)
                if word_id < 0:
                    # out-of-vocabulary word: fall back to its UNK signature
                    unk_signature = self.refiner.refine(word, pos)
                    self.__word_alphabet.add(unk_signature)
                    word_ids.append(
                        self.__word_alphabet.get_index(unk_signature))
                else:
                    word_ids.append(word_id)
            else:
                word_ids.append(self.__word_alphabet.get_index(word))

        label_id = int(label)

        return NERInstance(Sentence(words, word_ids, None, None), label,
                           label_id, None, None, None, None)
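
A quick aside on the DIGIT_RE.sub("0", word) idiom that recurs in every example here: it collapses each digit to "0" so that all numerals share a single vocabulary entry. Below is a minimal, self-contained sketch of the idea; the exact regex is an assumption, since DIGIT_RE is defined elsewhere in the source repository.

    import re

    # assumed definition of the digit-normalization pattern
    DIGIT_RE = re.compile(r"\d")

    def normalize(token, normalize_digits=True):
        # mirrors the inline expression used by the readers above
        return DIGIT_RE.sub("0", token) if normalize_digits else token

    print(normalize("GDP-2019"))                          # GDP-0000
    print(normalize("GDP-2019", normalize_digits=False))  # GDP-2019
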
Example #2
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0:
                        continue
                    tokens = line.split('\t')[0].split(' ')

                    for token in tokens:
                        word = DIGIT_RE.sub("0", token) if normalize_digits else token

                        if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                            vocab_set.add(word)
                            vocab_list.append(word)
Example #3
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0:
                        continue

                    tokens = line.split(' ')
                    word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                    pos = tokens[1]
                    chunk = tokens[2]
                    ner = tokens[3]

                    pos_alphabet.add(pos)
                    chunk_alphabet.add(chunk)
                    ner_alphabet.add(ner)

                    if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)
Example #4
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0:
                        continue

                    tokens = line.split('\t')
                    for char in tokens[1]:
                        char_alphabet.add(char)

                    word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
                    pos = tokens[4]
                    type = tokens[7]

                    pos_alphabet.add(pos)
                    type_alphabet.add(type)

                    if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)
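
The three expand_vocab variants above (Examples #2-#4) differ only in the file format they parse; each is a closure over names defined in an enclosing create_alphabets-style function (vocab_list, data_paths, embedd_dict, normalize_digits, and the task-specific alphabets). The following self-contained sketch shows the same pattern with file reading replaced by in-memory token lists; the function and argument names are illustrative, not taken from the source.

    import re

    DIGIT_RE = re.compile(r"\d")  # assumed pattern, as in the sketch above

    def build_vocab(train_tokens, extra_token_lists, embedd_dict, normalize_digits=True):
        # training vocabulary, de-duplicated while preserving order
        vocab_list = list(dict.fromkeys(train_tokens))

        def expand_vocab():
            # add unseen words from held-out data, but only when the pretrained
            # embeddings cover them (the same guard as in the variants above)
            vocab_set = set(vocab_list)
            for tokens in extra_token_lists:
                for token in tokens:
                    word = DIGIT_RE.sub("0", token) if normalize_digits else token
                    if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)

        expand_vocab()
        return vocab_list

    # toy run: "dog" is added (it has a pretrained vector), "zzz" is not
    print(build_vocab(["the", "cat"], [["dog", "the", "zzz"]], {"dog": [0.1], "cat": [0.2]}))
    # -> ['the', 'cat', 'dog']
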
Example #5
    def getNext(self,
                normalize_digits=True,
                symbolic_root=False,
                symbolic_end=False):
        line = self.__source_file.readline()
        # skip multiple blank lines.
        while len(line) > 0 and len(line.strip()) == 0:
            line = self.__source_file.readline()
        if len(line) == 0:
            return None

        lines = []
        while len(line.strip()) > 0:
            line = line.strip()
            lines.append(line.split('\t'))
            line = self.__source_file.readline()

        length = len(lines)
        if length == 0:
            return None

        words = []
        word_ids = []
        char_seqs = []
        char_id_seqs = []
        postags = []
        pos_ids = []
        types = []
        type_ids = []
        heads = []

        if symbolic_root:
            words.append(ROOT)
            word_ids.append(self.__word_alphabet.get_index(ROOT))
            char_seqs.append([ROOT_CHAR])
            char_id_seqs.append([self.__char_alphabet.get_index(ROOT_CHAR)])
            postags.append(ROOT_POS)
            pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS))
            types.append(ROOT_TYPE)
            type_ids.append(self.__type_alphabet.get_index(ROOT_TYPE))
            heads.append(0)

        for tokens in lines:
            chars = []
            char_ids = []
            for char in tokens[1]:
                chars.append(char)
                char_ids.append(self.__char_alphabet.get_index(char))
            if len(chars) > MAX_CHAR_LENGTH:
                chars = chars[:MAX_CHAR_LENGTH]
                char_ids = char_ids[:MAX_CHAR_LENGTH]
            char_seqs.append(chars)
            char_id_seqs.append(char_ids)

            word = DIGIT_RE.sub("0",
                                tokens[1]) if normalize_digits else tokens[1]
            pos = tokens[4]
            head = int(tokens[6])
            type = tokens[7]

            words.append(word)
            word_ids.append(self.__word_alphabet.get_index(word))

            postags.append(pos)
            pos_ids.append(self.__pos_alphabet.get_index(pos))

            types.append(type)
            type_ids.append(self.__type_alphabet.get_index(type))

            heads.append(head)

        if symbolic_end:
            words.append(END)
            word_ids.append(self.__word_alphabet.get_index(END))
            char_seqs.append([END_CHAR])
            char_id_seqs.append([self.__char_alphabet.get_index(END_CHAR)])
            postags.append(END_POS)
            pos_ids.append(self.__pos_alphabet.get_index(END_POS))
            types.append(END_TYPE)
            type_ids.append(self.__type_alphabet.get_index(END_TYPE))
            heads.append(0)

        for position, word in enumerate(words):
            # TODO here the position is not correct
            if self.refine_unk:
                word_idx = self.__word_alphabet.get_index(word)
                if word_idx < 0:
                    unk_signature = self.refiner.refine(word, position)
                    word_ids[position] = self.__word_alphabet.get_index(
                        unk_signature)

        return DependencyInstance(
            Sentence(words, word_ids, char_seqs, char_id_seqs), postags,
            pos_ids, heads, types, type_ids)
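
The column indices used by this reader follow the tab-separated CoNLL-X layout (ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL, PHEAD, PDEPREL): tokens[1] is the word form, tokens[4] the POS tag, tokens[6] the head index, and tokens[7] the dependency relation. A self-contained illustration with a made-up line:

    # columns: ID  FORM  LEMMA  CPOSTAG  POSTAG  FEATS  HEAD  DEPREL  PHEAD  PDEPREL
    sample = "2\tPierre\tpierre\tNNP\tNNP\t_\t3\tnmod\t_\t_"
    tokens = sample.split('\t')
    word, pos, head, dep_type = tokens[1], tokens[4], int(tokens[6]), tokens[7]
    print(word, pos, head, dep_type)   # Pierre NNP 3 nmod
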
Example #6
    def getNext(self, normalize_digits=True):
        line = self.__source_file.readline()
        # skip multiple blank lines.
        while len(line) > 0 and len(line.strip()) == 0:
            line = self.__source_file.readline()
        if len(line) == 0:
            return None

        lines = []
        while len(line.strip()) > 0:
            line = line.strip()
            lines.append(line.split(' '))
            line = self.__source_file.readline()

        length = len(lines)
        if length == 0:
            return None

        words = []
        word_ids = []
        char_seqs = []
        char_id_seqs = []
        postags = []
        pos_ids = []
        chunk_tags = []
        chunk_ids = []
        ner_tags = []
        ner_ids = []

        for tokens in lines:
            chars = []
            char_ids = []
            for char in tokens[0]:
                chars.append(char)
                char_ids.append(self.__char_alphabet.get_index(char))
            if len(chars) > MAX_CHAR_LENGTH:
                chars = chars[:MAX_CHAR_LENGTH]
                char_ids = char_ids[:MAX_CHAR_LENGTH]
            char_seqs.append(chars)
            char_id_seqs.append(char_ids)

            word = DIGIT_RE.sub("0",
                                tokens[0]) if normalize_digits else tokens[0]
            pos = tokens[1]
            chunk = tokens[2]
            ner = tokens[3]

            words.append(word)
            word_ids.append(self.__word_alphabet.get_index(word))

            postags.append(pos)
            pos_ids.append(self.__pos_alphabet.get_index(pos))

            chunk_tags.append(chunk)
            chunk_ids.append(self.__chunk_alphabet.get_index(chunk))

            ner_tags.append(ner)
            ner_ids.append(self.__ner_alphabet.get_index(ner))

        return NERInstance(Sentence(words, word_ids, char_seqs,
                                    char_id_seqs), postags, pos_ids,
                           chunk_tags, chunk_ids, ner_tags, ner_ids)
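
This variant expects the space-separated CoNLL-2003 layout instead: one token per line with its POS tag, syntactic chunk tag, and NER tag, and a blank line between sentences. A one-line illustration using a token from the standard English data:

    sample = "U.N. NNP I-NP I-ORG"
    word, pos, chunk, ner = sample.split(' ')
    print(word, pos, chunk, ner)   # U.N. NNP I-NP I-ORG
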
Example #7
def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None,
                     min_occurrence=1, normalize_digits=True):

    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0:
                        continue

                    tokens = line.split(' ')
                    word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                    pos = tokens[1]
                    chunk = tokens[2]
                    ner = tokens[3]

                    pos_alphabet.add(pos)
                    chunk_alphabet.add(chunk)
                    ner_alphabet.add(ner)

                    if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', default_value=True, singleton=True)
    char_alphabet = Alphabet('character', default_value=True)
    pos_alphabet = Alphabet('pos')
    chunk_alphabet = Alphabet('chunk')
    ner_alphabet = Alphabet('ner')

    if not os.path.isdir(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        char_alphabet.add(PAD_CHAR)
        pos_alphabet.add(PAD_POS)
        chunk_alphabet.add(PAD_CHUNK)
        ner_alphabet.add(PAD_NER)

        vocab = defaultdict(int)
        with open(train_path, 'r') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split(' ')
                for char in tokens[0]:
                    char_alphabet.add(char)

                word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                vocab[word] += 1

                pos = tokens[1]
                pos_alphabet.add(pos)

                chunk = tokens[2]
                chunk_alphabet.add(chunk)

                ner = tokens[3]
                ner_alphabet.add(ner)

        # collect singletons
        singletons = set([word for word, count in vocab.items() if count <= min_occurrence])

        # if a word also appears in the pretrained embedding dict, raise its count
        # by min_occurrence so that embedded singletons are not pruned as rare words
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for word in vocab.keys():
                if word in embedd_dict or word.lower() in embedd_dict:
                    vocab[word] += min_occurrence

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size:  %d" % len(singletons))
        vocab_list = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
        logger.info("Total Vocabulary Size (w.o rare words): %d" % len(vocab_list))

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            expand_vocab()

        for word in vocab_list:
            word_alphabet.add(word)
            if word in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(word))

        word_alphabet.save(alphabet_directory)
        char_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        chunk_alphabet.save(alphabet_directory)
        ner_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)
        char_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        chunk_alphabet.load(alphabet_directory)
        ner_alphabet.load(alphabet_directory)

    word_alphabet.close()
    char_alphabet.close()
    pos_alphabet.close()
    chunk_alphabet.close()
    ner_alphabet.close()
    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())
    return word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet
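
A hedged sketch of how this function might be wired together with load_embedding_dict from Example #9 below; the file paths are placeholders and the choice of GloVe is only an assumption, not a prescribed pipeline.

    # placeholder paths -- adjust to the actual data layout
    embedd_dict, embedd_dim = load_embedding_dict('glove', 'data/glove.6B.100d.txt.gz')
    word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet = \
        create_alphabets('data/alphabets/ner/', 'data/conll03/train.txt',
                         data_paths=['data/conll03/dev.txt', 'data/conll03/test.txt'],
                         embedd_dict=embedd_dict, max_vocabulary_size=50000)
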
Example #8
def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None,
                     min_occurrence=1, normalize_digits=True, unk_rank=5):

    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0:
                        continue

                    tokens = line.split('\t')
                    for char in tokens[1]:
                        char_alphabet.add(char)

                    word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
                    pos = tokens[4]
                    type = tokens[7]

                    pos_alphabet.add(pos)
                    type_alphabet.add(type)

                    if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', singleton=True)
    char_alphabet = Alphabet('character')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    logger.info("Creating Alphabets: %s" % alphabet_directory)

    char_alphabet.add(PAD_CHAR)
    pos_alphabet.add(PAD_POS)
    type_alphabet.add(PAD_TYPE)

    char_alphabet.add(ROOT_CHAR)
    pos_alphabet.add(ROOT_POS)
    type_alphabet.add(ROOT_TYPE)

    char_alphabet.add(END_CHAR)
    pos_alphabet.add(END_POS)
    type_alphabet.add(END_TYPE)

    vocab = defaultdict(int)

    # collect every (word, position) pair, sentence by sentence, so that UNK
    # signatures for singletons can be generated later
    word_collect = []
    with open(train_path, 'r') as file:
        words = []
        position = 0
        for line in file:
            line = line.strip()
            if len(line) == 0:
                position = 0
                word_collect.append(words)
                words = []
                continue

            tokens = line.split('\t')
            for char in tokens[1]:
                char_alphabet.add(char)

            word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
            vocab[word] += 1
            words.append((word, position))
            position += 1

            pos = tokens[4]
            pos_alphabet.add(pos)

            type = tokens[7]
            type_alphabet.add(type)

        # keep the final sentence if the file does not end with a blank line
        if len(words) > 0:
            word_collect.append(words)

    # collect singletons
    singletons = set([word for word, count in vocab.items() if count <= min_occurrence])

    # if a word also appears in the pretrained embedding dict, raise its count
    # by min_occurrence so that embedded singletons are not pruned as rare words
    if embedd_dict is not None:
        assert isinstance(embedd_dict, OrderedDict)
        for word in vocab.keys():
            if word in embedd_dict or word.lower() in embedd_dict:
                vocab[word] += min_occurrence

    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    logger.info("Total Vocabulary Size: %d" % len(vocab_list))
    logger.info("Total Singleton Size:  %d" % len(singletons))
    multi_vocab = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
    logger.info("Total Vocabulary Size (w.o rare words): %d" % len(multi_vocab))

    if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]

    if data_paths is not None and embedd_dict is not None:
        expand_vocab()

    for word in vocab_list:
        if word in multi_vocab:
            word_alphabet.add(word)
        elif word in singletons:
            word_alphabet.add_singleton(word_alphabet.get_index(word))
        else:
            raise ValueError("Error word: " + word)

    # unk refiner
    unk_refiner = UNKRefiner(level=unk_rank, alphabet=word_alphabet)
    for words in word_collect:
        for word, position in words:
            if word in singletons:
                unk_signature = unk_refiner.refine(word, position)
                word_alphabet.add(unk_signature)

    word_alphabet.close()
    char_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()
    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Type Alphabet Size: %d" % type_alphabet.size())
    return word_alphabet, char_alphabet, pos_alphabet, type_alphabet
Example #9
def load_embedding_dict(embedding, embedding_path, normalize_digits=True):
    """
    load word embeddings from file
    :param embedding:
    :param embedding_path:
    :return: embedding dict, embedding dimention, caseless
    """
    print("loading embedding: %s from %s" % (embedding, embedding_path))
    if embedding == 'word2vec':
        # loading word2vec
        word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True)
        embedd_dim = word2vec.vector_size
        return word2vec, embedd_dim
    elif embedding == 'glove':
        # loading GloVe
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt', encoding='utf8') as file:
            for line1 in file:
                line = line1.strip()
                if len(line) == 0:
                    continue

                tokens = line.split(' ')
                if embedd_dim < 0:
                    embedd_dim = len(tokens) - 1
                elif embedd_dim + 1 != len(tokens):
                    # skip malformed lines instead of asserting on them
                    continue
                embedd = np.empty([1, embedd_dim], dtype=np.float32)
                embedd[:] = tokens[1:]
                word = DIGIT_RE.sub(
                    "0", tokens[0]) if normalize_digits else tokens[0]
                embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'senna':
        # loading Senna
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split()
                if embedd_dim < 0:
                    embedd_dim = len(tokens) - 1
                else:
                    assert (embedd_dim + 1 == len(tokens))
                embedd = np.empty([1, embedd_dim], dtype=np.float32)
                embedd[:] = tokens[1:]
                word = DIGIT_RE.sub(
                    "0", tokens[0]) if normalize_digits else tokens[0]
                embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'sskip':
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            # skip the first line
            file.readline()
            for line in file:
                line = line.strip()
                try:
                    if len(line) == 0:
                        continue

                    tokens = line.split()
                    if len(tokens) < embedd_dim:
                        continue

                    if embedd_dim < 0:
                        embedd_dim = len(tokens) - 1

                    embedd = np.empty([1, embedd_dim], dtype=np.float32)
                    start = len(tokens) - embedd_dim
                    word = ' '.join(tokens[0:start])
                    embedd[:] = tokens[start:]
                    word = DIGIT_RE.sub("0",
                                        word) if normalize_digits else word
                    embedd_dict[word] = embedd
                except UnicodeDecodeError:
                    continue
        return embedd_dict, embedd_dim
    elif embedding == 'polyglot':
        words, embeddings = pickle.load(open(embedding_path, 'rb'),
                                        encoding='latin1')
        _, embedd_dim = embeddings.shape
        embedd_dict = OrderedDict()
        for i, word in enumerate(words):
            embedd = np.empty([1, embedd_dim], dtype=np.float32)
            embedd[:] = embeddings[i, :]
            word = DIGIT_RE.sub("0", word) if normalize_digits else word
            embedd_dict[word] = embedd
        return embedd_dict, embedd_dim

    else:
        raise ValueError(
            "embedding should choose from [word2vec, senna, glove, sskip, polyglot]"
        )
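
A common next step after loading the dict is to pack it into a dense lookup table aligned with a word list (for instance, the vocab_list built by the create_alphabets variants). A minimal sketch; the random initialization for out-of-embedding words and the uniform scale are assumptions, not taken from the source.

    import numpy as np

    def build_embedding_table(word_list, embedd_dict, embedd_dim, scale=0.1):
        table = np.empty([len(word_list), embedd_dim], dtype=np.float32)
        for i, word in enumerate(word_list):
            if word in embedd_dict:
                vec = embedd_dict[word]
            elif word.lower() in embedd_dict:
                # same caseless fallback as in the expand_vocab variants
                vec = embedd_dict[word.lower()]
            else:
                # word has no pretrained vector: assumed random initialization
                vec = np.random.uniform(-scale, scale, embedd_dim)
            # vectors from load_embedding_dict arrive with shape [1, embedd_dim]
            table[i] = np.asarray(vec, dtype=np.float32).ravel()
        return table
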
Example #10
def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None,
                     min_occurrence=1, normalize_digits=True):
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0:
                        continue
                    tokens = line.split('\t')[0].split(' ')

                    for token in tokens:
                        word = DIGIT_RE.sub("0", token) if normalize_digits else token

                        if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                            vocab_set.add(word)
                            vocab_list.append(word)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', singleton=True)
    if not os.path.isdir(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)
        vocab = defaultdict(int)
        with open(train_path, 'r') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split('\t')[0].split(' ')
                for token in tokens:
                    word = DIGIT_RE.sub("0", token) if normalize_digits else token
                    vocab[word] += 1

        # collect singletons
        singletons = set([word for word, count in vocab.items() if count <= min_occurrence])

        # if a word also appears in the pretrained embedding dict, raise its count
        # by min_occurrence so that embedded singletons are not pruned as rare words
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for word in vocab.keys():
                if word in embedd_dict or word.lower() in embedd_dict:
                    vocab[word] += min_occurrence

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size:  %d" % len(singletons))
        multi_vocab = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
        logger.info("Total Vocabulary Size (w.o rare words): %d" % len(multi_vocab))

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            expand_vocab()
        for word in vocab_list:
            if word in multi_vocab:
                word_alphabet.add(word)
            elif word in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(word))
            else:
                raise ValueError("Error word: " + word)
        refiner = UNKRefiner(0, word_alphabet)
        # TODO fix the pos here
        for word in singletons:
            unk_signature = refiner.refine(word, 0)
            word_alphabet.add(unk_signature)

        word_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)

    word_alphabet.close()
    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    return word_alphabet