def from_dataframe(cls, review_df, cutoff=25):
    """Instantiate the vectorizer from the dataset dataframe

    Args:
        review_df (pandas.DataFrame): the review dataset
        cutoff (int): the parameter for frequency-based filtering
    Returns:
        an instance of the ReviewVectorizer
    """
    review_vocab = Vocabulary(add_unk=True)
    rating_vocab = Vocabulary(add_unk=False)

    # Add ratings
    for rating in sorted(set(review_df.rating)):
        rating_vocab.add_token(rating)

    # Add top words if count > provided count
    word_counts = Counter()
    for review in review_df.review:
        for word in review.split(" "):
            if word not in string.punctuation:
                word_counts[word] += 1

    for word, count in word_counts.items():
        if count > cutoff:
            review_vocab.add_token(word)

    return cls(review_vocab, rating_vocab)
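# The snippet above assumes a Vocabulary class with add_token() and an
# optional <UNK> entry. A minimal sketch of that interface, for reference
# only; names and behavior are assumptions, not the original class:

from collections import Counter
import string


class Vocabulary:
    """Minimal token <-> index mapping (sketch, not the original class)."""

    def __init__(self, add_unk=True, unk_token="<UNK>"):
        self._token_to_idx = {}
        self._idx_to_token = {}
        self.unk_index = self.add_token(unk_token) if add_unk else -1

    def add_token(self, token):
        # Assign the next free index to unseen tokens; idempotent otherwise.
        if token not in self._token_to_idx:
            idx = len(self._token_to_idx)
            self._token_to_idx[token] = idx
            self._idx_to_token[idx] = token
        return self._token_to_idx[token]

    def lookup_token(self, token):
        # Fall back to <UNK> when the token was filtered out by the cutoff.
        return self._token_to_idx.get(token, self.unk_index)

    def __len__(self):
        return len(self._token_to_idx)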
def __init__(self, mode, preprocessed, srcVocaThreshold, tgtVocaThreshold,
             deprelLabelThreshold, printEvery, trainSize, testSize, devSize):
    if preprocessed:
        tgtTrain = './data/processed/train.en'
        actTrain = './data/processed/train.oracle.en'
        tgtDev = './data/processed/dev.en'
        actDev = './data/processed/dev.oracle.en'
        tgtTest = './data/processed/test.en'
        actTest = './data/processed/test.oracle.en'
        srcTrain = './data/processed/train.kr'
        deprelTrain = './data/processed/train.deprel.kr'
        srcDev = './data/processed/dev.kr'
        deprelDev = './data/processed/dev.deprel.kr'
        srcTest = './data/processed/test.kr'
        deprelTest = './data/processed/test.deprel.kr'
    else:
        train_permutation = list(range(0, 99999))
        random.shuffle(train_permutation)
        dev_permutation = list(range(0, 10000))
        random.shuffle(dev_permutation)
        print('Parsing target file into plain sentences & actions...')
        tgtTrain, actTrain = self.conll_to_action('./data/tagged_train.en',
                                                  trainSize, train_permutation)
        tgtDev, actDev = self.conll_to_action('./data/tagged_dev.en',
                                              devSize, dev_permutation)
        print('Parsing source file into plain sentences & dependency relations...')
        srcTrain, deprelTrain = self.conll_to_deprels('./data/tagged_train.kr',
                                                      trainSize, train_permutation)
        srcDev, deprelDev = self.conll_to_deprels('./data/tagged_dev.kr',
                                                  devSize, dev_permutation)

    print('Loading processed data...')
    self.sourceVoc = Vocabulary(srcTrain, srcVocaThreshold, 'lang')
    self.targetVoc = Vocabulary(tgtTrain, tgtVocaThreshold, 'lang')
    self.actionVoc = Vocabulary(actTrain, None, 'action')
    self.deprelVoc = Vocabulary(deprelTrain, deprelLabelThreshold, 'deprel')

    self.trainData = []
    self.devData = []
    self.trainData = self.loadCorpus(srcTrain, tgtTrain, actTrain,
                                     deprelTrain, self.trainData)
    self.devData = self.loadCorpus(srcDev, tgtDev, actDev,
                                   deprelDev, self.devData)
    self.printEvery = printEvery
    print('Loaded...')
def __init__(self, naiveBayesMatrix, beta=-1):
    naiveBayesMatrix = naiveBayesMatrix.todense()

    # v: total word count over the whole matrix
    v = 0
    for x in range(naiveBayesMatrix.shape[0]):
        v += naiveBayesMatrix[x, :].sum()

    # Default to B = 1/v when no beta is supplied.
    B = beta
    if beta == -1:
        B = 1 / v
    alphaMinusOne = B  # (alpha - 1)

    # |V|: length of the vocabulary list
    vocab = Vocabulary()
    vocabListLength = vocab.length

    # (alpha - 1) * |V|, the constant part of the denominator
    denominatorStatic = alphaMinusOne * vocabListLength

    # count(Xi in Yk) + (alpha - 1)
    numerator = naiveBayesMatrix + alphaMinusOne

    # P(Xi|Yk) = (count(Xi in Yk) + (alpha - 1)) / (count(Yk) + (alpha - 1) * |V|)
    for x in range(numerator.shape[0]):
        denominatorDynamic = naiveBayesMatrix[x, :].sum()
        numerator[x, :] *= 1 / (denominatorDynamic + denominatorStatic)

    # log2 P(Xi|Yk)
    self.mapmatrix = np.log2(numerator)
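# A quick numeric sanity check of the MAP smoothing used above. The counts
# are made up; this only verifies that the smoothed per-class distribution
# still sums to 1:

import numpy as np

# Hypothetical counts for one class over a 3-word vocabulary.
counts = np.array([2.0, 0.0, 1.0])
alpha_minus_one = 1 / counts.sum()  # the default B = 1/v, with v = 3 here
V = counts.size

# MAP estimate: (count + (alpha-1)) / (total + (alpha-1) * |V|)
p = (counts + alpha_minus_one) / (counts.sum() + alpha_minus_one * V)
assert abs(p.sum() - 1.0) < 1e-12  # smoothed probabilities still sum to 1
print(np.log2(p))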
def getVocabByID(ID):
    from Vocabulary import Vocabulary

    # Returns a Vocabulary item based on the record ID in the database.
    conn = None
    try:
        conn = sqlite3.connect('FinnVocab.db')
        num2use = str(ID)
        # Pass the parameter as a tuple; a plain string breaks the query for
        # any ID >= 10. See:
        # http://stackoverflow.com/questions/4409539/pythonsqlite-the-like-query-with-wildcards#4409584
        cursor = conn.execute(
            "SELECT ID, FINNISH, ENGLISH FROM VOCABULARY WHERE ID = ?;",
            (num2use, ))

        # There should be exactly one result returned.
        record = cursor.fetchone()
        theID = record[0]
        theFinn = record[1].strip()
        theEngl = record[2].strip()
        theData = Vocabulary(theID, theFinn, theEngl)
    except sqlite3.Error:
        print("Unable to retrieve record number " + str(ID) + ".")
        traceback.print_exc()
        # Set theData to None so the program can fail gracefully.
        theData = None
    finally:
        # Close the connection only if it was actually opened; the original
        # raised a NameError here when connect() itself failed.
        if conn is not None:
            conn.close()
    return theData
def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper."""
    # The loop below needs a COCO annotation index; `coco` and `ids` were
    # missing from the original snippet (this assumes pycocotools).
    coco = COCO(json)
    ids = coco.anns.keys()
    counter = Counter()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if (i + 1) % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(i + 1, len(ids)))

    # If the word frequency is less than 'threshold', the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
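# Typical usage, assuming the standard COCO captions annotation file layout;
# the paths and the word2idx attribute on the wrapper are assumptions:

import pickle

vocab = build_vocab(json='data/annotations/captions_train2014.json',
                    threshold=4)
print('Vocabulary size: {}'.format(len(vocab.word2idx)))

with open('data/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)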
def get_model(name='saved_models/model', path=extracted_data_train_dir,
              size=TRAIN_SIZE):
    train_vocab = Vocabulary(train_captions, max_vocab_size)
    # Drop the leading token of each encoded caption, then pad to equal length.
    encoded = [
        torch.tensor(x[1:], dtype=torch.int16)
        for x in train_vocab.encoded_captions
    ]
    padded = pad_sequence(encoded).permute(1, 0)
    dataset = MyDataset(enc_captions=padded[:size],
                        image_paths=train_image_paths[:size],
                        data_dir=path + 'vecs/')
    dataloader = DataLoader(dataset=dataset, batch_size=256, num_workers=0)
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    model = End2End(ENC_INPUT, ENC_OUTPUT, DEC_HID_DIM, DEC_OUTPUT, EMB_DIM,
                    ATTN_DIM, train_vocab, criterion, device)
    model.load_state_dict(torch.load(name))
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    return model, dataset, dataloader, optimizer
def build_vocabulary(self, threshold):
    counter = Counter()
    for id in self.image_desc:
        caption = self.image_desc[id]
        tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(caption.lower())
        if len(tokens) > self.input_maxlen - 2:
            self.input_maxlen = len(tokens) + 2
        counter.update(tokens)

    for id in self.story_data:
        temp_in = 0
        temp_out = 0
        for seq in self.story_data[id]:
            caption = seq[2]
            tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(
                caption.lower())
            counter.update(tokens)
            temp_out = temp_out + len(tokens)

            caption_in = self.image_desc[seq[1]]
            tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(
                caption_in.lower())
            temp_in = temp_in + len(tokens)
            counter.update(tokens)

        if temp_out > self.output_maxlen - 2:
            self.output_maxlen = temp_out + 2
        if temp_in > self.input_maxlen - 2:
            # The original assigned temp_out here; temp_in is clearly intended.
            self.input_maxlen = temp_in + 2

    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocabulary = Vocabulary()
    vocabulary.add_word('<pad>')
    vocabulary.add_word('<start>')
    vocabulary.add_word('<end>')
    vocabulary.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocabulary.add_word(word)

    with open("./Dataset/vocabulary.pkl", "wb") as f:
        pickle.dump(vocabulary, f)
    return vocabulary
def __init__(self, corpus = ""): self.voc = Vocabulary() self.tm = scipy.sparse.dok_matrix((1000,1000), dtype=np.float32) self.add_from_text(corpus) self.start = UnigramLM(self.voc) self.valid = False self.sorted_tokens = []
def loadVocabs(self):
    """
    Reads the suffixVocab files from disk and stores them in
    dictionaries and arrays.
    """
    self.vocabs = {}
    for path in self.vocabPaths:
        print(path)
        self.vocabs[path] = Vocabulary(path)
def __init__(self, image_ids, image_folder_path, mode='train', vocab_file="",
             vocab_threshold=5, batch_size=10):
    assert mode in ['train', 'val', 'test']
    self.mode = mode
    self.image_folder_path = image_folder_path
    self.batch_size = batch_size

    # Get pre-processed objects; keep only the selected subset of captions.
    all_captions_dict = load_obj('captions_dict')
    captions_dict = {
        image_id: all_captions_dict[image_id]
        for image_id in image_ids
    }

    # Set up vocabulary (train) or load a previously built one.
    if self.mode == 'train':
        self.vocab = Vocabulary(captions_dict)
        print('Vocabulary successfully created')
    elif vocab_file != "":
        self.vocab = vocab_file
        self.word2idx = self.vocab.word2idx
        self.idx2word = self.vocab.idx2word
    else:
        self.vocab = load_obj("vocab")
        self.word2idx = self.vocab.word2idx
        self.idx2word = self.vocab.idx2word
        print('Vocabulary successfully loaded')

    # Batch size is forced to 1 at test time.
    if self.mode == 'test':
        self.batch_size = 1

    # Set up dataset. im_ids keeps duplicates for indexing: if captions 1-5
    # all correspond to image 8, im_ids will be [8, 8, 8, 8, 8].
    self.im_ids = []
    self.captions = []
    self.images = []
    self.captions_len = []
    for im_id, captions_list in captions_dict.items():
        for item in captions_list:
            self.im_ids.append(im_id)
            self.captions.append(item)
            self.captions_len.append(len(nltk.tokenize.word_tokenize(item)))

    # Set up parameters for image feature extraction.
    self.transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])
def __init__(self, collection, doStats=False, postingsFile=False):
    self.collection = collection
    self.lexAnalyser = False
    self.calculateStats = doStats
    self.vocabulary = Vocabulary()
    self.postings = DictionaryPostings({})
    self.documents = Documents()
    self.maxFreqInDocs = {}
    if self.calculateStats:
        self.stats = self.getInitStats()
def generate_vocabulary(counter, threshold):
    """Generate vocabulary."""
    vocab = Vocabulary()

    # Keep words that occur at least `threshold` times.
    words = sorted([word for word, cnt in counter.items() if cnt >= threshold])

    # Add words to the dictionary.
    for i, word in enumerate(words):
        vocab.addWord(word)
    return vocab
def build_vocabulary(self, threshold):
    # A cached-vocabulary shortcut was disabled in the original:
    # if os.path.exists("./Dataset/vocabulary.pkl"):
    #     with open("./Dataset/vocabulary.pkl", "rb") as f:
    #         return pickle.load(f)
    counter = Counter()
    for id in self.image_desc:
        caption = self.image_desc[id]
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        if len(tokens) > self.max_length - 2:
            self.max_length = len(tokens) + 2
        counter.update(tokens)

    for annot in self.coco_desc:
        caption = annot['caption']
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        if len(tokens) > self.max_length - 2:
            self.max_length = len(tokens) + 2
        counter.update(tokens)

    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocabulary = Vocabulary()
    vocabulary.add_word('<pad>')
    vocabulary.add_word('<start>')
    vocabulary.add_word('<end>')
    vocabulary.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocabulary.add_word(word)

    with open("./Dataset/vocabulary.pkl", "wb") as f:
        pickle.dump(vocabulary, f)
    return vocabulary
def build_vocabulary(self, threshold):
    counter = Counter()
    for id in self.image_desc:
        caption = self.image_desc[id]
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        if len(tokens) > self.max_length - 2:
            self.max_length = len(tokens) + 2
        counter.update(tokens)

    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocabulary = Vocabulary()
    vocabulary.add_word('<pad>')
    vocabulary.add_word('<start>')
    vocabulary.add_word('<end>')
    vocabulary.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocabulary.add_word(word)
    return vocabulary
def from_dataframe(cls, predictor_df, classifier, cutoff=25):  # GLOVE_MODEL
    """Instantiate the vectorizer from the dataset dataframe

    Args:
        predictor_df (pandas.DataFrame): the predictor dataset
        classifier (str): model type; 'GloVe' selects a SequenceVocabulary
        cutoff (int): the parameter for frequency-based filtering
    Returns:
        an instance of the ReviewVectorizer
    """
    if classifier == 'GloVe':
        predictor_vocab = SequenceVocabulary()
    else:
        predictor_vocab = Vocabulary(add_unk=True)
    target_vocab = Vocabulary(add_unk=False)
    max_predictor_length = 0

    # Add targets
    for target in sorted(set(predictor_df.target)):
        target_vocab.add_token(target)

    # Add top words if count > provided count
    word_counts = Counter()
    for index, row in predictor_df.iterrows():
        vector = remove_punctuation(row.predictor)
        max_predictor_length = max(max_predictor_length, len(vector))
        for word in vector:
            word_counts[word] += 1

    for word, count in word_counts.items():
        if count > cutoff:
            predictor_vocab.add_token(word)

    return cls(predictor_vocab, target_vocab, max_predictor_length)  # for CNN
def __init__(self, dim_emb, dim_hid,
             vocab_file='./data/preprocessed/vocab_file.vocab'):
    super().__init__()
    self.vocab = Vocabulary()
    self.vocab.load(vocab_file=vocab_file)
    self.embed = torch.nn.Embedding(len(self.vocab), dim_emb)
    self.rnn1 = torch.nn.LSTM(dim_emb, dim_hid, batch_first=True)
    self.rnn2 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
    self.out = torch.nn.Linear(dim_hid, len(self.vocab))
def prepare_training_data(sent_pairs):
    voc = Vocabulary()
    sent_pairs_normalized = []
    for sent_p in sent_pairs:
        # Normalize the incorrect and correct sentence in each pair and
        # append them to the normalized sentence pairs.
        incorrect_sent_normalized = normalize_string(sent_p[0])
        correct_sent_normalized = normalize_string(sent_p[1])
        normalized_sents_pair = (incorrect_sent_normalized,
                                 correct_sent_normalized)
        sent_pairs_normalized.append(normalized_sents_pair)
        # Add the normalized sentence pair to the vocabulary.
        voc.add_sentence_pair(normalized_sents_pair)
    return voc, sent_pairs_normalized
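# A minimal call, assuming normalize_string does typical lower-casing and
# punctuation stripping; the sentence pairs here are purely illustrative:

sent_pairs = [
    ("he go to school", "he goes to school"),   # (incorrect, correct)
    ("she have a cats", "she has a cat"),
]

voc, pairs = prepare_training_data(sent_pairs)
print(len(pairs), "normalized pairs")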
def __init__(self, dim_emb, dim_hid,
             vocab_file='./data/preprocessed/vocab_file.vocab'):
    super(Encoder_Decoder, self).__init__()
    self.vocab = Vocabulary()
    self.vocab.load(vocab_file=vocab_file)
    self.dim_hid = dim_hid
    self.word_embeddings = nn.Embedding(len(self.vocab), dim_emb)
    self.en_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)
    self.de_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)
    # Fully connected layer mapping the LSTM hidden state to
    # vocabulary-size logits.
    self.hidden2linear = nn.Linear(dim_hid, len(self.vocab))
def get_average_vector_of_sentence(self, sentences_with_word, vectors2D,
                                   vocabulary_model):
    average_vector = []
    for sentence in sentences_with_word:
        vectors = []
        words = Vocabulary(self.language).sentence_to_wordlist(sentence)
        for word in words:
            try:
                vectors.append(
                    vectors2D[vocabulary_model.wv.vocab[word.lower()].index])
            except KeyError:
                # Skip words that are not in the trained vocabulary.
                continue
        # Column-wise mean: one averaged vector per sentence.
        average_vector.append(np.asarray(vectors).mean(axis=0))
    self.average_vector = average_vector
    return average_vector
def main(word, language, part_of_speech, number_of_clusters):
    # Get sentences either from a txt file or from the db.
    sentences = get_corpus_from_txt_file(language)
    # sentences = get_corpus_from_db(language)
    print("Got sentences for language " + language)
    vocabulary = Vocabulary(language)
    print("Created Vocabulary")
    cluster = Cluster(language, number_of_clusters)
    words = vocabulary.make_array_of_words_from_sentences(sentences)
    print("Made arrays for every sentence")
    # Get the trained model (vocabulary).
    throne2vec = vocabulary.build_vocabulary(words)
    print("Trained the model")
    all_word_vectors_matrix_2d = cluster.make_vectors_2D(throne2vec)
    print("Made matrix with vectors")
def from_dataframe(cls, news_df):
    """Instantiate the vectorizer from the dataset dataframe

    Args:
        news_df (pandas.DataFrame): the target dataset
    Returns:
        an instance of the NREVectorizer
    """
    relation_vocab = Vocabulary()
    for relation in set(news_df.relation):
        relation_vocab.add_token(relation)

    seq_vocab = SequenceVocabulary()
    for sequence in news_df.sequence:
        word_list = list(jieba.cut(sequence, cut_all=False))
        seq_vocab.add_many(word_list)

    return cls(seq_vocab, relation_vocab)
def test_knowledge_CRUD(self):
    tag_id = 0
    property_tag = SemanticTag("Property", "ssn:Property")
    measurement_property_tag = SemanticTag(
        "Measurement Property", "ssn:property/MeasurementProperty")
    relation_tag1 = PredicateSemanticTag("is a",
                                         "http://www.w3.org/ns/ssn/#is_a",
                                         property_tag,
                                         measurement_property_tag)
    semantic_net = SemanticNet()
    semantic_net.add_tag(property_tag, tag_id)
    tag_id += 1
    semantic_net.add_tag(measurement_property_tag, tag_id)
    tag_id += 1
    semantic_net.add_predicate(relation_tag1)

    accuracy_tag = SemanticTag("Accuracy",
                               "ssn:property/MeasurementProperty/Accuracy")
    relation_tag2 = PredicateSemanticTag("is a",
                                         "http://www.w3.org/ns/ssn/#is_a",
                                         measurement_property_tag,
                                         accuracy_tag)
    semantic_net.add_tag(accuracy_tag, tag_id)
    semantic_net.add_predicate(relation_tag2)

    vocabulary = Vocabulary(semantic_net, None, None, None, None)
    knowledge = Knowledge(vocabulary, None, None)
    kb = KnowledgeBase(knowledge)

    client = MongoClient()
    db = client.local
    collection = db.knowledge_base

    knowledge_json = jsonpickle.encode(knowledge)
    print("\n" + knowledge_json + "\n")
    print(type(knowledge_json))
    knowledge_bson = bson.son.SON(json.loads(knowledge_json))
    print(type(knowledge_bson))
    semantic_net_json = jsonpickle.encode(semantic_net)
    collection.insert_one(knowledge_bson)
    knowledge_found = collection.find_one(
        bson.son.SON(json.loads(knowledge_json)))
    print("Knowledge retrieved: " + str(knowledge_found))
def from_dataframe(cls, dataset_df, cutoff=c_frequencyCutoff):
    """Instantiate the Vectorizer from the dataset dataframe

    Args:
        dataset_df (pandas.DataFrame): the tweets dataset
        cutoff (int): the parameter for frequency-based filtering
    Returns:
        an instance of the TwitterVectorizer
    """
    # Instantiate the Vocabulary for the text column.
    text_vocabulary = cls._get_text_vocabulary()

    # Instantiate the Vocabulary for the target column.
    target_vocabulary = Vocabulary(add_unknown_token=False)

    # Add elements to the target Vocabulary.
    for target in sorted(set(dataset_df.target)):
        target_vocabulary.add_token(target)

    # Tweet tokenizer to split text into tokens.
    tokenizer = TweetTokenizer()

    # Count word frequencies over the whole dataset.
    word_counts = Counter()
    for text in dataset_df.text:
        for word in tokenizer.tokenize(text):
            word_counts[word] += 1

    # For all extracted words: if a word is not punctuation and appears more
    # than `cutoff` times, add it to the text Vocabulary.
    for word, count in word_counts.items():
        if (word not in string.punctuation) and (count > cutoff):
            text_vocabulary.add_token(word)

    return cls(text_vocabulary, target_vocabulary)
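# Why TweetTokenizer rather than a plain split: it keeps hashtags, mentions,
# and emoticons as single tokens. A quick check (expected output shown as a
# comment; exact behavior can vary across nltk versions):

from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()
print(tokenizer.tokenize("@user loving this #nlp :-)"))
# e.g. ['@user', 'loving', 'this', '#nlp', ':-)']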
def simple_run():
    """train without k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # use the GPU id given on the command line, if any
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data: separate text and labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # split off a dev set
    word_train, word_dev, label_train, label_dev = dev_split(config.train_dir)
    # simple run without k-fold
    run(word_train, label_train, word_dev, label_dev, vocab, device)
def initialize_model_from_local(path, hparams, de_tokenize=False,
                                verbose=False):
    start = time.time()
    conn = sqlite3.connect(path)
    c = conn.cursor()
    v = Vocabulary(max_len=hparams['MAX_LEN'])
    v.load_vocab_from_local(c, hparams['VOCAB'], verbose)
    c.close()
    conn.close()
    if de_tokenize:
        v.de_tokenize_data()
    if verbose:
        print('Vocabulary created!')
    enc, dec, opt = create_model(hparams)
    print('Time to initialize model {:.2f} min | {:.2f} hrs\n'.format(
        (time.time() - start) / 60, (time.time() - start) / 3600))
    return v, enc, dec, opt
def __init__(self, ngram_file, insert_word_posn, index_insert_words,
             min_count):
    print("\nNGramIndex: Initializing a new index")
    self.index_insert_words = index_insert_words
    self.counts_total = 0
    self.num_uniq_ngrams = 0
    self.vocab = Vocabulary()

    print("Now examining ngram counts in", ngram_file)
    max_lines = self.ngram_file_mincount_line(ngram_file, min_count)
    print("For a min ngram count of", min_count,
          "need to read", max_lines, "ngrams from", ngram_file)

    self.ngram_hash = numpy.zeros(max_lines, dtype=numpy.int64)
    self.ngram_count = numpy.zeros(max_lines, dtype=numpy.int32)
    if index_insert_words:
        self.ngram_gapword = numpy.zeros(max_lines, dtype=numpy.int32)
    self.build_index(ngram_file, insert_word_posn, index_insert_words,
                     max_lines, min_count)
def prep_dataset():
    wiki_path = WIKI_PATH

    if CONTEXT_CAPACITY % 2 != 0:
        raise Exception("Context length should be even")
    context_window = CONTEXT_CAPACITY + 1

    print("Loading...", end="")
    wiki = WikiDataLoader(wiki_path)
    voc = Vocabulary()
    tok = Tokenizer()
    print("done")

    wiki_doc = wiki.next_doc()
    wikiprep = open("WikiPrepData.txt", "w")
    i = 0
    while wiki_doc:
        doc = tok(wiki_doc)
        voc.add(doc)
        sample = np.array(voc.text2ids(doc))
        # Sliding windows via broadcasting: row r holds ids r .. r+context_window-1.
        indexer = np.arange(context_window)[None, :] + np.arange(
            len(sample) - context_window)[:, None]
        smpl = sample[indexer]
        for row in smpl:
            for val in row:
                wikiprep.write("%d " % val)
            wikiprep.write("\n")
        i += 1
        if i == 2000:
            break
        wiki_doc = wiki.next_doc()
    wikiprep.close()  # the original never closed this handle

    pickle.dump(voc, open("WikiPrepVoc.pkl", "wb"))
    print("Vocabulary ready")
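# The indexer line above is the standard broadcasting trick for extracting
# all fixed-length windows from a 1-D array; a self-contained check:

import numpy as np

sample = np.arange(10, 17)  # e.g. token ids [10 .. 16]
w = 3                       # window length (context_window above)
indexer = np.arange(w)[None, :] + np.arange(len(sample) - w)[:, None]
print(sample[indexer])
# [[10 11 12]
#  [11 12 13]
#  [12 13 14]
#  [13 14 15]]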
def main_freq():
    logging.info("Loading dataset")
    dataset = load_dataset("ag_news")
    dataset_text = [r['text'] for r in dataset['train']]
    dataset_labels = [r['label'] for r in dataset['train']]

    logging.info("Building vocabulary")
    vocab = Vocabulary(dataset_text)
    vocab.make_vocab_charts()
    plt.close()
    plt.pause(0.01)

    logging.info("Computing PPMI matrix")
    PPMI = compute_ppmi_matrix(dataset_text, vocab)

    logging.info("Performing Truncated SVD to reduce dimensionality")
    word_vectors = dim_reduce(PPMI)

    logging.info("Preparing T-SNE plot")
    plot_word_vectors_tsne(word_vectors, vocab)
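# compute_ppmi_matrix is project code; for reference, a minimal dense PPMI
# from a (words x contexts) co-occurrence count matrix. A sketch, not the
# project's implementation:

import numpy as np

def ppmi(C, eps=1e-12):
    """Positive PMI from a co-occurrence count matrix C."""
    total = C.sum()
    p_wc = C / total                       # joint probabilities
    p_w = p_wc.sum(axis=1, keepdims=True)  # word marginals
    p_c = p_wc.sum(axis=0, keepdims=True)  # context marginals
    pmi = np.log((p_wc + eps) / (p_w @ p_c + eps))
    return np.maximum(pmi, 0.0)            # clip negative PMI to zero

C = np.array([[2.0, 1.0], [0.0, 3.0]])
print(ppmi(C))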
def k_fold_run():
    """train with k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # use the GPU id given on the command line, if any
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data: separate text and labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # load the training set and run the k-fold splits
    data = np.load(config.train_dir, allow_pickle=True)
    words = data["words"]
    labels = data["labels"]
    kf = KFold(n_splits=config.n_split)
    kf_data = kf.split(words, labels)
    kf_index = 0
    total_test_loss = 0
    total_f1 = 0
    for train_index, dev_index in kf_data:
        kf_index += 1
        word_train = words[train_index]
        label_train = labels[train_index]
        word_dev = words[dev_index]
        label_dev = labels[dev_index]
        test_loss, f1 = run(word_train, label_train, word_dev, label_dev,
                            vocab, device, kf_index)
        total_test_loss += test_loss
        total_f1 += f1
    average_test_loss = float(total_test_loss) / config.n_split
    average_f1 = float(total_f1) / config.n_split
    logging.info("Average test loss: {} , average f1 score: {}".format(
        average_test_loss, average_f1))