def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None,
                     min_occurrence=1, normalize_digits=True):
    """Create the word, character, POS, chunk and NER alphabets, or reload saved ones.

    When ``alphabet_directory`` is not an existing directory, the alphabets are
    built from ``train_path`` (optionally extended from ``data_paths``) and
    written out through each alphabet's ``save``. Otherwise all five are
    restored through ``load``. In both cases every alphabet is closed before
    it is returned.

    Input files are read line by line. Blank lines are skipped; every other
    line is split on a single space, with field 0 taken as the word and
    fields 1, 2 and 3 as the POS, chunk and NER tags.

    Args:
        alphabet_directory: Location the alphabets are saved to / loaded from.
        train_path: Training file used for word counts, characters and tags.
        data_paths: Optional iterable of further files. Only consulted when
            ``embedd_dict`` is also given: their tags are added, and so is any
            not-yet-listed word that the embedding dict contains (as-is or
            lower-cased).
        max_vocabulary_size: Cap applied to the word list (``_START_VOCAB``
            entries included) before the ``data_paths`` extension.
        embedd_dict: Optional pretrained embedding mapping; asserted to be an
            ``OrderedDict``. Training words it contains get their count raised
            by ``min_occurrence`` so they pass the rare-word filter.
        min_occurrence: Words counted at most this many times are singletons.
        normalize_digits: When true, words go through
            ``DIGIT_RE.sub("0", ...)`` before being counted.

    Returns:
        The tuple ``(word_alphabet, char_alphabet, pos_alphabet,
        chunk_alphabet, ner_alphabet)``.
    """

    def normalize(token):
        # Optional digit normalisation of a single raw word.
        return DIGIT_RE.sub("0", token) if normalize_digits else token

    def extend_from_extra_data():
        # Closure over vocab_list: it appends in place, so it has to run after
        # the list has been filtered and truncated.
        known = set(vocab_list)
        for extra_path in data_paths:
            with open(extra_path, 'r') as handle:
                for raw in handle:
                    stripped = raw.strip()
                    if not stripped:
                        continue

                    fields = stripped.split(' ')
                    candidate = normalize(fields[0])
                    # Read all three tags before touching any alphabet.
                    pos_tag, chunk_tag, ner_tag = fields[1], fields[2], fields[3]

                    pos_alphabet.add(pos_tag)
                    chunk_alphabet.add(chunk_tag)
                    ner_alphabet.add(ner_tag)

                    if candidate in known:
                        continue
                    if candidate in embedd_dict or candidate.lower() in embedd_dict:
                        known.add(candidate)
                        vocab_list.append(candidate)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', default_value=True, singleton=True)
    char_alphabet = Alphabet('character', default_value=True)
    pos_alphabet = Alphabet('pos')
    chunk_alphabet = Alphabet('chunk')
    ner_alphabet = Alphabet('ner')
    all_alphabets = (word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)

    if os.path.isdir(alphabet_directory):
        # A saved copy exists: restore every alphabet from it.
        for alphabet in all_alphabets:
            alphabet.load(alphabet_directory)
    else:
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        char_alphabet.add(PAD_CHAR)
        pos_alphabet.add(PAD_POS)
        chunk_alphabet.add(PAD_CHUNK)
        ner_alphabet.add(PAD_NER)

        # Pass over the training file: collect characters, word counts, tags.
        vocab = defaultdict(int)
        with open(train_path, 'r') as handle:
            for raw in handle:
                stripped = raw.strip()
                if not stripped:
                    continue

                fields = stripped.split(' ')
                surface = fields[0]
                # Characters come from the raw word, counts from the normalised one.
                for char in surface:
                    char_alphabet.add(char)
                vocab[normalize(surface)] += 1

                pos_alphabet.add(fields[1])
                chunk_alphabet.add(fields[2])
                ner_alphabet.add(fields[3])

        # Singletons are fixed before the embedding bonus below is applied.
        singletons = {token for token, freq in vocab.items() if freq <= min_occurrence}

        # A word found in the pretrained embeddings gets min_occurrence added
        # to its count, which lifts it above the rare-word threshold.
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for token in vocab:
                if token in embedd_dict or token.lower() in embedd_dict:
                    vocab[token] += min_occurrence

        by_frequency = sorted(vocab, key=vocab.get, reverse=True)
        vocab_list = _START_VOCAB + by_frequency
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size: %d" % len(singletons))

        vocab_list = [token for token in vocab_list
                      if token in _START_VOCAB or vocab[token] > min_occurrence]
        logger.info("Total Vocabulary Size (w.o rare words): %d" % len(vocab_list))

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            extend_from_extra_data()

        for token in vocab_list:
            word_alphabet.add(token)
            if token in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(token))

        for alphabet in all_alphabets:
            alphabet.save(alphabet_directory)

    for alphabet in all_alphabets:
        alphabet.close()

    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())
    return word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet
def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None,
                     min_occurrence=1, normalize_digits=True):
    """Create the word alphabet from training text, or reload a saved one.

    When ``alphabet_directory`` is not an existing directory, the alphabet is
    built from ``train_path`` (optionally extended from ``data_paths``) and
    written out with ``word_alphabet.save``. Otherwise it is restored with
    ``word_alphabet.load``. The alphabet is closed before it is returned.

    Input files are read line by line. Blank lines are skipped; for every
    other line only the text before the first tab is used, split on single
    spaces into tokens.

    Args:
        alphabet_directory: Location the alphabet is saved to / loaded from.
        train_path: Training file used for the word counts.
        data_paths: Optional iterable of further files. Only consulted when
            ``embedd_dict`` is also given: any not-yet-listed word that the
            embedding dict contains (as-is or lower-cased) is appended to the
            vocabulary and added to the alphabet as a regular word.
        max_vocabulary_size: Cap applied to the word list (``_START_VOCAB``
            entries included) before the ``data_paths`` extension.
        embedd_dict: Optional pretrained embedding mapping; asserted to be an
            ``OrderedDict``. Training words it contains get their count raised
            by ``min_occurrence``.
        min_occurrence: Words counted at most this many times are singletons.
        normalize_digits: When true, words go through
            ``DIGIT_RE.sub("0", ...)`` before being counted.

    Returns:
        The closed word alphabet.

    Raises:
        ValueError: If a vocabulary word is neither a regular word nor a
            singleton (not expected for words taken from the input files).
    """

    def expand_vocab():
        # Closure over vocab_list and regular_words; both are mutated in place.
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if not line:
                        continue

                    for token in line.split('\t')[0].split(' '):
                        word = DIGIT_RE.sub("0", token) if normalize_digits else token
                        if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                            vocab_set.add(word)
                            vocab_list.append(word)
                            # Register the word as regular. Without this it is
                            # in neither category and the add loop below
                            # raises ValueError for it.
                            regular_words.add(word)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', singleton=True)

    if not os.path.isdir(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        vocab = defaultdict(int)
        with open(train_path, 'r') as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue

                for token in line.split('\t')[0].split(' '):
                    word = DIGIT_RE.sub("0", token) if normalize_digits else token
                    vocab[word] += 1

        # Singletons are fixed before the embedding bonus below is applied.
        singletons = {word for word, count in vocab.items() if count <= min_occurrence}

        # A word found in the pretrained embeddings gets min_occurrence added
        # to its count, which lifts it above the rare-word threshold.
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for word in vocab:
                if word in embedd_dict or word.lower() in embedd_dict:
                    vocab[word] += min_occurrence

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size: %d" % len(singletons))

        multi_vocab = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
        logger.info("Total Vocabulary Size (w.o rare words): %d" % len(multi_vocab))
        # Set copy for constant-time membership tests; testing against the
        # list inside the loop below would be quadratic in vocabulary size.
        regular_words = set(multi_vocab)

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            expand_vocab()

        for word in vocab_list:
            if word in regular_words:
                word_alphabet.add(word)
            elif word in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(word))
            else:
                raise ValueError("Error word: " + word)

        # NOTE(review): UNKRefiner is defined outside this block; it appears to
        # map a rare word to an unknown-word signature - confirm.
        refiner = UNKRefiner(0, word_alphabet)
        # TODO fix the pos here
        # Iterate in sorted order: the iteration order of a set of strings
        # changes between interpreter runs, so unsorted iteration would add
        # the signatures in an unpredictable order.
        for word in sorted(singletons):
            unk_signature = refiner.refine(word, 0)
            word_alphabet.add(unk_signature)

        word_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)

    word_alphabet.close()
    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    return word_alphabet