Example #1
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0 or line.startswith('#'):  # conllu format. Attardi
                        continue

                    tokens = line.split('\t')
                    if '-' in tokens[0] or '.' in tokens[0]:  # conllu. Attardi
                        continue
                    for char in tokens[1]:
                        char_alphabet.add(char)

                    word = DIGIT_RE.sub(
                        "0", tokens[1]) if normalize_digits else tokens[1]
                    pos = tokens[4]
                    type = tokens[7]

                    pos_alphabet.add(pos)
                    type_alphabet.add(type)

                    if word not in vocab_set and (word in embedd_dict or
                                                  word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)
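
Note: DIGIT_RE is not defined in any of these snippets. It is presumably a compiled regular expression roughly along the following lines, so that digit normalization rewrites every digit as "0"; this is an assumption, not the project's exact definition.

import re

# Assumed definition of DIGIT_RE (not taken from the source): replaces every
# decimal digit, so "born in 1984" becomes "born in 0000".
DIGIT_RE = re.compile(r"\d")
print(DIGIT_RE.sub("0", "born in 1984"))  # -> born in 0000
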
Example #2
    def getNext(self, normalize_digits=True):
        line = self.__source_file.readline()
        # skip multiple blank lines.
        while len(line) > 0 and len(line.strip()) == 0:
            line = self.__source_file.readline()
        if len(line) == 0:
            return None

        lines = []
        while len(line.strip()) > 0:
            line = line.strip()
            lines.append(line.split(' '))
            line = self.__source_file.readline()

        length = len(lines)
        if length == 0:
            return None

        words = []
        char_seqs = []
        postags = []
        chunk_tags = []
        ner_tags = []

        for tokens in lines:
            if '-' in tokens[0] or '.' in tokens[0]:  # conllu clitics. Attardi
                continue
            chars = []
            for char in tokens[1]:
                chars.append(char)
            if len(chars) > MAX_CHAR_LENGTH:
                chars = chars[:MAX_CHAR_LENGTH]
            char_seqs.append(chars)

            word = DIGIT_RE.sub("0",
                                tokens[1]) if normalize_digits else tokens[1]
            pos = tokens[2]
            chunk = tokens[3]
            ner = tokens[4]

            words.append(word)
            postags.append(pos)
            chunk_tags.append(chunk)
            ner_tags.append(ner)

        return NERInstance(Sentence(words, char_seqs), postags, chunk_tags,
                           ner_tags)
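
For reference, this reader consumes space-separated rows in which column 1 holds the word, column 2 the POS tag, column 3 the chunk tag and column 4 the NER tag. A minimal sketch with a made-up row (illustrative only, not taken from the source data):

# Hypothetical CoNLL-2003-style row, split the same way getNext splits it.
sample = "1 EU NNP B-NP B-ORG"
tokens = sample.split(' ')
word, pos, chunk, ner = tokens[1], tokens[2], tokens[3], tokens[4]
print(word, pos, chunk, ner)  # EU NNP B-NP B-ORG
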
Example #3
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            sentences = parse(open(data_path, 'r').read())
            for sentence in sentences:
                for word in sentence:
                    form = word['form']
                    pos = word['upostag']
                    type = word['deprel']

                    real_word = form.split('_BERT_')[0]
                    for char in real_word:
                        char_alphabet.add(char)
                    form = DIGIT_RE.sub("0",
                                        form) if normalize_digits else form
                    pos_alphabet.add(pos)
                    type_alphabet.add(type)

                    if form not in vocab_set and (form in embedd_dict or
                                                  form.lower() in embedd_dict):
                        vocab_set.add(form)
                        vocab_list.append(form)
Example #4
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0:
                        continue

                    tokens = line.split(' ')
                    word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
                    pos = tokens[2]
                    chunk = tokens[3]
                    ner = tokens[4]

                    pos_alphabet.add(pos)
                    chunk_alphabet.add(chunk)
                    ner_alphabet.add(ner)

                    if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)
Example #5
def create_alphabets(alphabet_directory,
                     train_path,
                     data_paths=None,
                     max_vocabulary_size=100000,
                     embedd_dict=None,
                     min_occurrence=0,
                     normalize_digits=False):
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            sentences = parse(open(data_path, 'r').read())
            for sentence in sentences:
                for word in sentence:
                    form = word['form']
                    pos = word['upostag']
                    type = word['deprel']

                    real_word = form.split('_BERT_')[0]
                    for char in real_word:
                        char_alphabet.add(char)
                    form = DIGIT_RE.sub("0",
                                        form) if normalize_digits else form
                    pos_alphabet.add(pos)
                    type_alphabet.add(type)

                    if form not in vocab_set and (form in embedd_dict or
                                                  form.lower() in embedd_dict):
                        vocab_set.add(form)
                        vocab_list.append(form)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', defualt_value=True, singleton=False)
    char_alphabet = Alphabet('character', defualt_value=True)
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')
    if not os.path.isdir(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        char_alphabet.add(PAD_CHAR)
        pos_alphabet.add(PAD_POS)
        type_alphabet.add(PAD_TYPE)

        char_alphabet.add(ROOT_CHAR)
        pos_alphabet.add(ROOT_POS)
        type_alphabet.add(ROOT_TYPE)

        char_alphabet.add(END_CHAR)
        pos_alphabet.add(END_POS)
        type_alphabet.add(END_TYPE)

        vocab = defaultdict(int)
        sentences = parse(open(train_path, 'r').read())
        for sentence in sentences:
            for word in sentence:
                form = word['form']
                pos = word['upostag']
                type = word['deprel']

                real_word = form.split('_BERT_')[0]
                for char in real_word:
                    char_alphabet.add(char)
                form = DIGIT_RE.sub("0", form) if normalize_digits else form
                vocab[form] += 1
                pos_alphabet.add(pos)
                type_alphabet.add(type)

        # collect singletons
        singletons = set(
            [word for word, count in vocab.items() if count <= min_occurrence])

        # if a singleton is in pretrained embedding dict, set the count to min_occur + c
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for word in vocab.keys():
                if word in embedd_dict or word.lower() in embedd_dict:
                    vocab[word] += min_occurrence

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size:  %d" % len(singletons))
        vocab_list = [
            word for word in vocab_list
            if word in _START_VOCAB or vocab[word] > min_occurrence
        ]
        logger.info("Total Vocabulary Size (w.o rare words): %d" %
                    len(vocab_list))

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            expand_vocab()

        for word in vocab_list:
            word_alphabet.add(word)
            if word in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(word))

        word_alphabet.save(alphabet_directory)
        char_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        type_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)
        char_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        type_alphabet.load(alphabet_directory)

    word_alphabet.close()
    char_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()
    logger.info("Word Alphabet Size (Singleton): %d (%d)" %
                (word_alphabet.size(), word_alphabet.singleton_size()))
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Type Alphabet Size: %d" % type_alphabet.size())
    return word_alphabet, char_alphabet, pos_alphabet, type_alphabet
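Example #6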
    def getNext(self,
                normalize_digits=True,
                symbolic_root=False,
                symbolic_end=False):
        line = self.__source_file.readline()
        # skip multiple blank lines.
        while len(line) > 0 and len(line.strip()) == 0:
            line = self.__source_file.readline()
        if len(line) == 0:
            return None

        lines = []
        while len(line.strip()) > 0:
            line = line.strip()
            lines.append(line.split('\t'))
            line = self.__source_file.readline()

        length = len(lines)
        if length == 0:
            return None

        words = []
        word_ids = []
        char_seqs = []
        char_id_seqs = []
        postags = []
        pos_ids = []
        types = []
        type_ids = []
        heads = []

        if symbolic_root:
            words.append(ROOT)
            word_ids.append(self.__word_alphabet.get_index(ROOT))
            char_seqs.append([
                ROOT_CHAR,
            ])
            char_id_seqs.append([
                self.__char_alphabet.get_index(ROOT_CHAR),
            ])
            postags.append(ROOT_POS)
            pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS))
            types.append(ROOT_TYPE)
            type_ids.append(self.__type_alphabet.get_index(ROOT_TYPE))
            heads.append(0)

        for tokens in lines:
            chars = []
            char_ids = []
            for char in tokens[1]:
                chars.append(char)
                char_ids.append(self.__char_alphabet.get_index(char))
            if len(chars) > MAX_CHAR_LENGTH:
                chars = chars[:MAX_CHAR_LENGTH]
                char_ids = char_ids[:MAX_CHAR_LENGTH]
            char_seqs.append(chars)
            char_id_seqs.append(char_ids)

            word = DIGIT_RE.sub("0",
                                tokens[1]) if normalize_digits else tokens[1]
            pos = tokens[4]
            head = int(tokens[6])
            type = tokens[7]

            words.append(word)
            word_ids.append(self.__word_alphabet.get_index(word))

            postags.append(pos)
            pos_ids.append(self.__pos_alphabet.get_index(pos))

            types.append(type)
            type_ids.append(self.__type_alphabet.get_index(type))

            heads.append(head)

        if symbolic_end:
            words.append(END)
            word_ids.append(self.__word_alphabet.get_index(END))
            char_seqs.append([
                END_CHAR,
            ])
            char_id_seqs.append([
                self.__char_alphabet.get_index(END_CHAR),
            ])
            postags.append(END_POS)
            pos_ids.append(self.__pos_alphabet.get_index(END_POS))
            types.append(END_TYPE)
            type_ids.append(self.__type_alphabet.get_index(END_TYPE))
            heads.append(0)

        bert_sent_token = []  # flat list of BERT subword tokens for the whole sentence
        one_subword_word_indicator_ids = []  # for each word, index of its first subword (+1 for the leading [CLS])
        for word in words:
            word_tokens = self.tokenizer.tokenize(word)
            one_subword_word_indicator_ids.append(len(bert_sent_token) + 1)
            bert_sent_token += word_tokens
        bert_sent_token_ids = self.tokenizer.convert_tokens_to_ids(
            ['[CLS]'] + bert_sent_token + ['[SEP]'])

        return DependencyInstance(
            Sentence(bert_sent_token_ids, one_subword_word_indicator_ids,
                     words, word_ids, char_seqs, char_id_seqs), postags,
            pos_ids, heads, types, type_ids)
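Example #7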
    def getNext(self, normalize_digits=True):
        line = self.__source_file.readline()
        # skip multiple blank lines.
        while len(line) > 0 and len(line.strip()) == 0:
            line = self.__source_file.readline()
        if len(line) == 0:
            return None

        lines = []
        while len(line.strip()) > 0:
            line = line.strip()
            lines.append(line.split(' '))
            line = self.__source_file.readline()

        length = len(lines)
        if length == 0:
            return None

        words = []
        word_ids = []
        char_seqs = []
        char_id_seqs = []
        postags = []
        pos_ids = []
        chunk_tags = []
        chunk_ids = []
        ner_tags = []
        ner_ids = []

        for tokens in lines:
            chars = []
            char_ids = []
            for char in tokens[1]:
                chars.append(char)
                char_ids.append(self.__char_alphabet.get_index(char))
            if len(chars) > MAX_CHAR_LENGTH:
                chars = chars[:MAX_CHAR_LENGTH]
                char_ids = char_ids[:MAX_CHAR_LENGTH]
            char_seqs.append(chars)
            char_id_seqs.append(char_ids)

            word = DIGIT_RE.sub("0",
                                tokens[1]) if normalize_digits else tokens[1]
            pos = tokens[2]
            chunk = tokens[3]
            ner = tokens[4]

            words.append(word)
            word_ids.append(self.__word_alphabet.get_index(word))

            postags.append(pos)
            pos_ids.append(self.__pos_alphabet.get_index(pos))

            chunk_tags.append(chunk)
            chunk_ids.append(self.__chunk_alphabet.get_index(chunk))

            ner_tags.append(ner)
            ner_ids.append(self.__ner_alphabet.get_index(ner))

        return NERInstance(Sentence(words, word_ids, char_seqs,
                                    char_id_seqs), postags, pos_ids,
                           chunk_tags, chunk_ids, ner_tags, ner_ids)
Example #8
    def getNext(self,
                normalize_digits=False,
                symbolic_root=False,
                symbolic_end=False):
        if len(self.__sentences) == self.__cur_idx:
            return None
        sentence = self.__sentences[self.__cur_idx]
        self.__cur_idx += 1

        words = []
        word_ids = []
        char_seqs = []
        char_id_seqs = []
        postags = []
        pos_ids = []
        types = []
        type_ids = []
        heads = []

        if symbolic_root:
            words.append(ROOT)
            word_ids.append(self.__word_alphabet.get_index(ROOT))
            char_seqs.append([
                ROOT_CHAR,
            ])
            char_id_seqs.append([
                self.__char_alphabet.get_index(ROOT_CHAR),
            ])
            postags.append(ROOT_POS)
            pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS))
            types.append(ROOT_TYPE)
            type_ids.append(self.__type_alphabet.get_index(ROOT_TYPE))
            heads.append(0)

        for word in sentence:
            chars = []
            char_ids = []
            real_word = word['form'].split('_BERT_')[0]
            for char in real_word:
                chars.append(char)
                char_ids.append(self.__char_alphabet.get_index(char))
            if len(chars) > MAX_CHAR_LENGTH:
                chars = chars[:MAX_CHAR_LENGTH]
                char_ids = char_ids[:MAX_CHAR_LENGTH]
            char_seqs.append(chars)
            char_id_seqs.append(char_ids)

            form = DIGIT_RE.sub(
                "0", word['form']) if normalize_digits else word['form']
            pos = word['upostag']
            head = word['head']
            type = word['deprel']

            words.append(form)
            word_ids.append(self.__word_alphabet.get_index(form))

            postags.append(pos)
            pos_ids.append(self.__pos_alphabet.get_index(pos))

            types.append(type)
            type_ids.append(self.__type_alphabet.get_index(type))

            heads.append(head)

        if symbolic_end:
            words.append(END)
            word_ids.append(self.__word_alphabet.get_index(END))
            char_seqs.append([
                END_CHAR,
            ])
            char_id_seqs.append([
                self.__char_alphabet.get_index(END_CHAR),
            ])
            postags.append(END_POS)
            pos_ids.append(self.__pos_alphabet.get_index(END_POS))
            types.append(END_TYPE)
            type_ids.append(self.__type_alphabet.get_index(END_TYPE))
            heads.append(0)

        return DependencyInstance(
            Sentence(words, word_ids, char_seqs, char_id_seqs), postags,
            pos_ids, heads, types, type_ids)
Example #9
def create_alphabets(alphabet_directory,
                     train_path,
                     data_paths=None,
                     max_vocabulary_size=100000,
                     embedd_dict=None,
                     min_occurrence=1,
                     normalize_digits=True):
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0 or line.startswith('#'):  # conllu format. Attardi
                        continue

                    tokens = line.split('\t')
                    if '-' in tokens[0] or '.' in tokens[0]:  # conllu. Attardi
                        continue
                    for char in tokens[1]:
                        char_alphabet.add(char)

                    word = DIGIT_RE.sub(
                        "0", tokens[1]) if normalize_digits else tokens[1]
                    pos = tokens[4]
                    type = tokens[7]

                    pos_alphabet.add(pos)
                    type_alphabet.add(type)

                    if word not in vocab_set and (word in embedd_dict or
                                                  word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', defualt_value=True, singleton=True)
    char_alphabet = Alphabet('character', defualt_value=True)
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')
    if not os.path.isdir(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        char_alphabet.add(PAD_CHAR)
        pos_alphabet.add(PAD_POS)
        type_alphabet.add(PAD_TYPE)

        char_alphabet.add(ROOT_CHAR)
        pos_alphabet.add(ROOT_POS)
        type_alphabet.add(ROOT_TYPE)

        char_alphabet.add(END_CHAR)
        pos_alphabet.add(END_POS)
        type_alphabet.add(END_TYPE)

        vocab = defaultdict(int)  # Attardi
        with open(train_path, 'r') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0 or line.startswith('#'):  # conllu. Attardi
                    continue

                tokens = line.split('\t')
                if '-' in tokens[0] or '.' in tokens[0]:  # conllu. Attardi
                    continue
                for char in tokens[1]:
                    char_alphabet.add(char)

                word = DIGIT_RE.sub(
                    "0", tokens[1]) if normalize_digits else tokens[1]
                vocab[word] += 1

                pos = tokens[4]
                pos_alphabet.add(pos)

                type = tokens[7]
                type_alphabet.add(type)

        # collect singletons
        singletons = set(
            [word for word, count in vocab.items() if count <= min_occurrence])

        # if a singleton is in pretrained embedding dict, set the count to min_occur + c
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for word in vocab.keys():
                if word in embedd_dict or word.lower() in embedd_dict:
                    vocab[word] += min_occurrence

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size:  %d" % len(singletons))
        vocab_list = [
            word for word in vocab_list
            if word in _START_VOCAB or vocab[word] > min_occurrence
        ]
        logger.info("Total Vocabulary Size (w.o rare words): %d" %
                    len(vocab_list))

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            expand_vocab()

        for word in vocab_list:
            word_alphabet.add(word)
            if word in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(word))

        word_alphabet.save(alphabet_directory)
        char_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        type_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)
        char_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        type_alphabet.load(alphabet_directory)

    word_alphabet.close()
    char_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()
    logger.info("Word Alphabet Size (Singleton): %d (%d)" %
                (word_alphabet.size(), word_alphabet.singleton_size()))
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Type Alphabet Size: %d" % type_alphabet.size())
    return word_alphabet, char_alphabet, pos_alphabet, type_alphabet
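Example #10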
def load_embedding_dict(embedding, embedding_path, normalize_digits=True):
    """
    load word embeddings from file
    :param embedding:
    :param embedding_path:
    :return: embedding dict, embedding dimention, caseless
    """
    print("loading embedding: %s from %s" % (embedding, embedding_path))
    if embedding == 'word2vec':
        # loading word2vec
        word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True)
        embedd_dim = word2vec.vector_size
        return word2vec, embedd_dim
    elif embedding == 'glove':
        # loading GloVe
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split()
                if embedd_dim < 0:
                    embedd_dim = len(tokens) - 1
                else:
                    assert (embedd_dim + 1 == len(tokens))
                embedd = np.empty([1, embedd_dim], dtype=np.float32)
                embedd[:] = tokens[1:]
                word = DIGIT_RE.sub(
                    "0", tokens[0]) if normalize_digits else tokens[0]
                embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'senna':
        # loading Senna
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split()
                if embedd_dim < 0:
                    embedd_dim = len(tokens) - 1
                else:
                    assert (embedd_dim + 1 == len(tokens))
                embedd = np.empty([1, embedd_dim], dtype=np.float32)
                embedd[:] = tokens[1:]
                word = DIGIT_RE.sub(
                    "0", tokens[0]) if normalize_digits else tokens[0]
                embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'sskip':
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:  # Attardi
            # skip the first line
            file.readline()
            for line in file:
                line = line.strip()
                try:
                    if len(line) == 0:
                        continue

                    tokens = line.split()
                    if len(tokens) < embedd_dim:
                        continue

                    if embedd_dim < 0:
                        embedd_dim = len(tokens) - 1

                    embedd = np.empty([1, embedd_dim], dtype=np.float32)
                    start = len(tokens) - embedd_dim
                    word = ' '.join(tokens[0:start])
                    embedd[:] = tokens[start:]
                    word = DIGIT_RE.sub("0",
                                        word) if normalize_digits else word
                    embedd_dict[word] = embedd
                except UnicodeDecodeError:
                    continue
        return embedd_dict, embedd_dim
    elif embedding == 'polyglot':
        words, embeddings = pickle.load(open(embedding_path, 'rb'),
                                        encoding='latin1')
        _, embedd_dim = embeddings.shape
        embedd_dict = OrderedDict()
        for i, word in enumerate(words):
            embedd = np.empty([1, embedd_dim], dtype=np.float32)
            embedd[:] = embeddings[i, :]
            word = DIGIT_RE.sub("0", word) if normalize_digits else word
            embedd_dict[word] = embedd
        return embedd_dict, embedd_dim

    else:
        raise ValueError(
            "embedding should choose from [word2vec, senna, glove, sskip, polyglot]"
        )
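
A minimal usage sketch combining the two helpers above. The module name conll_data and all file paths are hypothetical; the call itself only uses arguments that appear in the definitions above.

# Hypothetical driver code: module name and paths are assumptions, not part of
# the project.
from conll_data import create_alphabets, load_embedding_dict

# GloVe vectors stored as a gzipped text file, one "word v1 v2 ... vd" row per line.
embedd_dict, embedd_dim = load_embedding_dict('glove', 'data/glove.100d.txt.gz')

# Build the alphabets from the training file and expand the vocabulary with
# dev/test words that have a pretrained embedding.
word_alphabet, char_alphabet, pos_alphabet, type_alphabet = create_alphabets(
    'data/alphabets/',
    'data/train.conllu',
    data_paths=['data/dev.conllu', 'data/test.conllu'],
    max_vocabulary_size=100000,
    embedd_dict=embedd_dict,
    normalize_digits=True)
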
Example #11
    def getNext(self,
                normalize_digits=True,
                symbolic_root=False,
                symbolic_end=False):
        line = self.__source_file.readline()
        # skip multiple blank lines.
        while len(line) > 0 and len(line.strip()) == 0:
            line = self.__source_file.readline()
        if len(line) == 0:
            return None

        lines = []
        while len(line.strip()) > 0:
            if not line.startswith('#'):  # Attardi
                line = line.strip()
                tokens = line.split('\t')
                lines.append(tokens)
            line = self.__source_file.readline()

        length = len(lines)
        if length == 0:
            return None

        words = []
        char_seqs = []
        lemmas = []
        cpostags = []
        postags = []
        featss = []
        heads = []
        types = []
        depss = []
        miscs = []

        if symbolic_root:
            words.append(ROOT)
            char_seqs.append([ROOT_CHAR])
            lemmas.append(ROOT_LEMMA)
            postags.append(ROOT_POS)
            cpostags.append(ROOT_XPOS)
            featss.append(ROOT_FEATS)
            types.append(ROOT_TYPE)
            heads.append(0)
            depss.append(ROOT_DEPS)
            miscs.append(ROOT_MISC)

        for tokens in lines:
            chars = tokens[1][:MAX_CHAR_LENGTH]
            char_seqs.append(chars)

            word = DIGIT_RE.sub("0",
                                tokens[1]) if normalize_digits else tokens[1]
            lemma = tokens[2]
            cpos = tokens[3]
            pos = tokens[4]
            feats = tokens[5]
            head = int(tokens[6])
            type = tokens[7]
            deps = tokens[8]
            misc = tokens[9]

            words.append(word)
            lemmas.append(lemma)
            postags.append(pos)
            cpostags.append(cpos)
            featss.append(feats)
            heads.append(head)
            types.append(type)
            depss.append(deps)
            miscs.append(misc)

        if symbolic_end:
            words.append(END)
            char_seqs.append([END_CHAR])
            lemmas.append(END_LEMMA)
            cpostags.append(END_XPOS)
            postags.append(END_POS)
            featss.append(END_FEATS)
            heads.append(0)
            types.append(END_TYPE)
            depss.append(END_DEPS)
            miscs.append(END_MISC)

        return SentenceTree(Sentence(words, char_seqs), lemmas, postags,
                            cpostags, featss, heads, types, depss, miscs)
Example #12
    def getNext(self):
        words = []
        char_seqs = []
        lemmas = []
        upostags = []
        xpostags = []
        featss = []
        heads = []
        types = []
        depss = []
        miscs = []

        for line in self.__source_file:
            if line.strip() == '':  # EOS
                break
            if line.startswith('#'): continue
            tokens = line.split('\t')
            if '-' in tokens[0] or '.' in tokens[0]:  # conllu clitics. Attardi
                continue

            word = DIGIT_RE.sub(
                "0", tokens[1]) if self.normalize_digits else tokens[1]
            # trim to MAX_CHAR_LENGTH
            chars = tokens[1][:MAX_CHAR_LENGTH]
            lemma = tokens[2]
            upos = tokens[3]
            xpos = tokens[4]
            feats = tokens[5]
            head = int(tokens[6])
            type = tokens[7]
            deps = tokens[8]
            misc = tokens[9]

            words.append(word)
            char_seqs.append(chars)
            lemmas.append(lemma)
            upostags.append(upos)
            xpostags.append(xpos)
            featss.append(feats)
            heads.append(head)
            types.append(type)
            depss.append(deps)
            miscs.append(misc)

        if not words:
            return None

        if self.symbolic_root:
            words.insert(0, ROOT)
            char_seqs.insert(0, [ROOT_CHAR])
            lemmas.insert(0, ROOT_LEMMA)
            upostags.insert(0, ROOT_UPOS)
            xpostags.insert(0, ROOT_XPOS)
            featss.insert(0, ROOT_FEATS)  # keep featss aligned with the other per-token lists
            heads.insert(0, 0)
            types.insert(0, ROOT_TYPE)
            depss.insert(0, ROOT_DEPS)
            miscs.insert(0, ROOT_MISC)

        if self.symbolic_end:
            words.append(END)
            char_seqs.append([END_CHAR])
            lemmas.append(END_LEMMA)
            upostags.append(END_UPOS)
            xpostags.append(END_XPOS)
            featss.append(END_FEATS)
            heads.append(0)
            types.append(END_TYPE)
            depss.append(END_DEPS)
            miscs.append(END_MISC)

        return SentenceTree(Sentence(words, char_seqs), lemmas, upostags,
                            xpostags, featss, heads, types, depss, miscs)
Example #13
    def getNext(self,
                normalize_digits=True,
                symbolic_root=False,
                symbolic_end=False):
        line = self.__source_file.readline()
        # skip multiple blank lines.
        while len(line) > 0 and len(line.strip()) == 0:
            line = self.__source_file.readline()
        if len(line) == 0:
            return None

        lines = []
        while len(line.strip()) > 0:
            if not line.startswith('#'):  # Attardi
                line = line.strip()
                tokens = line.split('\t')
                if '-' not in tokens[0] and '.' not in tokens[0]:  # conllu. Attardi
                    lines.append(tokens)
            line = self.__source_file.readline()

        length = len(lines)
        if length == 0:
            return None

        words = []
        word_ids = []
        char_seqs = []
        char_id_seqs = []
        postags = []
        pos_ids = []
        types = []
        type_ids = []
        heads = []

        if symbolic_root:
            words.append(ROOT)
            word_ids.append(self.__word_alphabet.get_index(ROOT))
            char_seqs.append([
                ROOT_CHAR,
            ])
            char_id_seqs.append([
                self.__char_alphabet.get_index(ROOT_CHAR),
            ])
            postags.append(ROOT_POS)
            pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS))
            types.append(ROOT_TYPE)
            type_ids.append(self.__type_alphabet.get_index(ROOT_TYPE))
            heads.append(0)

        for tokens in lines:
            chars = []
            char_ids = []
            for char in tokens[1]:
                chars.append(char)
                char_ids.append(self.__char_alphabet.get_index(char))
            if len(chars) > MAX_CHAR_LENGTH:
                chars = chars[:MAX_CHAR_LENGTH]
                char_ids = char_ids[:MAX_CHAR_LENGTH]
            char_seqs.append(chars)
            char_id_seqs.append(char_ids)

            word = DIGIT_RE.sub("0",
                                tokens[1]) if normalize_digits else tokens[1]
            pos = tokens[4]
            head = int(tokens[6])
            type = tokens[7]

            words.append(word)
            word_ids.append(self.__word_alphabet.get_index(word))

            postags.append(pos)
            pos_ids.append(self.__pos_alphabet.get_index(pos))

            types.append(type)
            type_ids.append(self.__type_alphabet.get_index(type))

            heads.append(head)

        if symbolic_end:
            words.append(END)
            word_ids.append(self.__word_alphabet.get_index(END))
            char_seqs.append([
                END_CHAR,
            ])
            char_id_seqs.append([
                self.__char_alphabet.get_index(END_CHAR),
            ])
            postags.append(END_POS)
            pos_ids.append(self.__pos_alphabet.get_index(END_POS))
            types.append(END_TYPE)
            type_ids.append(self.__type_alphabet.get_index(END_TYPE))
            heads.append(0)

        return DependencyInstance(
            Sentence(words, word_ids, char_seqs, char_id_seqs), postags,
            pos_ids, heads, types, type_ids)
Example #14
def load_embedding_dict(embedding, embedding_path, normalize_digits=False, word2index_path=''):
    """
    load word embeddings from file
    :param embedding:
    :param embedding_path:
    :return: embedding dict, embedding dimention, caseless
    """
    print("loading embedding: %s from %s" % (embedding, embedding_path))
    if embedding == 'word2vec':
        # loading word2vec
        word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True)
        embedd_dim = word2vec.vector_size
        return word2vec, embedd_dim
    elif embedding == 'glove':
        # loading GloVe
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split()
                if embedd_dim < 0:
                    embedd_dim = len(tokens) - 1
                else:
                    assert (embedd_dim + 1 == len(tokens))
                embedd = np.empty([1, embedd_dim], dtype=np.float32)
                embedd[:] = tokens[1:]
                word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'senna':
        # loading Senna
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split()
                if embedd_dim < 0:
                    embedd_dim = len(tokens) - 1
                else:
                    assert (embedd_dim + 1 == len(tokens))
                embedd = np.empty([1, embedd_dim], dtype=np.float32)
                embedd[:] = tokens[1:]
                word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'sskip':
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            # skip the first line
            file.readline()
            for line in file:
                line = line.strip()
                try:
                    if len(line) == 0:
                        continue

                    tokens = line.split()
                    if len(tokens) < embedd_dim:
                        continue

                    if embedd_dim < 0:
                        embedd_dim = len(tokens) - 1

                    embedd = np.empty([1, embedd_dim], dtype=np.float32)
                    start = len(tokens) - embedd_dim
                    word = ' '.join(tokens[0:start])
                    embedd[:] = tokens[start:]
                    word = DIGIT_RE.sub("0", word) if normalize_digits else word
                    embedd_dict[word] = embedd
                except UnicodeDecodeError:
                    continue
        return embedd_dict, embedd_dim
    elif embedding == 'polyglot':
        words, embeddings = pickle.load(open(embedding_path, 'rb'), encoding='latin1')
        _, embedd_dim = embeddings.shape
        embedd_dict = OrderedDict()
        for i, word in enumerate(words):
            embedd = np.empty([1, embedd_dim], dtype=np.float32)
            embedd[:] = embeddings[i, :]
            word = DIGIT_RE.sub("0", word) if normalize_digits else word
            embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'fasttext':
        fin = io.open(embedding_path, 'r', encoding='utf-8', newline='\n', errors='ignore')
        n, d = map(int, fin.readline().split())
        embedd_dict = OrderedDict()
        for line in fin:
            tokens = line.rstrip().split(' ')
            embedd_dict[tokens[0]] = list(map(float, tokens[1:]))
        return embedd_dict, d  # dimension as declared in the file header
    elif embedding == 'bert':
        assert word2index_path != ''
        with open(word2index_path, 'r') as file:
            word2id = json.load(file)
        embedd_dict = OrderedDict()
        for key in word2id.keys():
            embedd_dict[key] = np.load(embedding_path+'/'+str(word2id[key])+'.npy').tolist()
        print(len(embedd_dict))
        return embedd_dict, 768
    else:
        raise ValueError("embedding must be one of [word2vec, senna, glove, sskip, polyglot, fasttext, bert]")
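
The 'bert' branch above expects embedding_path to be a directory of 768-dimensional .npy vectors named <index>.npy, with word2index_path pointing at a JSON word-to-index map. A minimal sketch of that assumed on-disk layout, using made-up words and zero vectors:

import json
import os

import numpy as np

# Hypothetical layout for the 'bert' branch; directory name and words are made up.
os.makedirs('bert_embeddings', exist_ok=True)
word2id = {'the': 0, 'cat': 1}
with open('bert_embeddings/word2index.json', 'w') as f:
    json.dump(word2id, f)
for word, idx in word2id.items():
    # One vector per word index, 768-dimensional to match the returned dimension.
    np.save(os.path.join('bert_embeddings', '%d.npy' % idx),
            np.zeros(768, dtype=np.float32))

# embedd_dict, dim = load_embedding_dict(
#     'bert', 'bert_embeddings',
#     word2index_path='bert_embeddings/word2index.json')  # dim == 768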