def load_benchmark(data, vocab, extend_with=0):
    # Decode data
    try:
        texts = [x for x in data['text']]
    except UnicodeDecodeError:
        texts = [x for x in data['text']]

    # Extract labels
    labels = [x for x in data['sent']]

    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(texts,
                                                   labels,
                                                   extend_with=extend_with)

    return {'texts': texts,
            'labels': labels,
            'added': added,
            'batch_size': batch_size,
            'maxlen': maxlen}
def load_benchmark(path, vocab, extend_with=0):
    """ Loads the given benchmark dataset.

        Tokenizes the texts using the provided vocabulary, extending it with
        words from the training dataset if extend_with > 0. Splits them into
        three lists: training, validation and testing (in that order).

        Also calculates the maximum length of the texts and the
        suggested batch_size.

    # Arguments:
        path: Path to the dataset to be loaded.
        vocab: Vocabulary to be used for tokenizing texts.
        extend_with: If > 0, the vocabulary will be extended with up to
            extend_with tokens from the training set before tokenizing.

    # Returns:
        A dictionary with the following fields:
            texts: List of three lists, containing tokenized inputs for
                training, validation and testing (in that order).
            labels: List of three lists, containing labels for training,
                validation and testing (in that order).
            added: Number of tokens added to the vocabulary.
            batch_size: Batch size.
            maxlen: Maximum length of an input.
    """
    # Pre-processing dataset
    with open(path) as dataset:
        data = pickle.load(dataset)

    # Decode data
    try:
        texts = [unicode(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    # Extract labels
    labels = [x['label'] for x in data['info']]

    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(
        texts, labels,
        [data['train_ind'], data['val_ind'], data['test_ind']],
        extend_with=extend_with)

    return {'texts': texts,
            'labels': labels,
            'added': added,
            'batch_size': batch_size,
            'maxlen': maxlen}
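# A minimal usage sketch for the function above, modelled on the fine-tuning
# examples in the DeepMoji repository. It assumes the deepmoji package layout
# (deepmoji.model_def, deepmoji.finetuning, deepmoji.global_variables);
# DATASET_PATH and nb_classes are placeholders for a real benchmark pickle.
import json
from deepmoji.model_def import deepmoji_transfer
from deepmoji.finetuning import load_benchmark, finetune
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

DATASET_PATH = 'data/SS-Youtube/raw.pickle'  # placeholder benchmark pickle
nb_classes = 2                               # placeholder number of classes

with open(VOCAB_PATH, 'r') as f:
    vocab = json.load(f)

# Tokenize and split the benchmark, extending the vocabulary with up to
# 10000 new tokens found in the training split.
data = load_benchmark(DATASET_PATH, vocab, extend_with=10000)

# Build the transfer model with the extended embedding and fine-tune it.
model = deepmoji_transfer(nb_classes, data['maxlen'], PRETRAINED_PATH,
                          extend_embedding=data['added'])
model, acc = finetune(model, data['texts'], data['labels'], nb_classes,
                      data['batch_size'], method='chain-thaw')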
def print_examples_sentences(filename: str = "example.txt",
                             output_filename: str = "output.txt"):
    examples = read_examples(filename)
    with open(output_filename, 'w') as f:
        for example in examples:
            sentences = SentenceTokenizer().tokenize(example)
            for sentence in sentences:
                f.write(sentence + "\n")
def load_model_1():
    # Load the pre-trained Keras model. The original template loaded a
    # ResNet50 pre-trained on ImageNet (left commented out below); here we
    # substitute the DeepMoji emoji model instead.
    global model
    # model = ResNet50(weights="imagenet")
    # The .h5 file is created externally and loaded here; the attention
    # layer (AttentionWeightedAverage) comes from the accompanying .py module.
    model = load_model(
        'emoji.h5',
        custom_objects={'AttentionWeightedAverage': AttentionWeightedAverage},
        compile=True)

    global st
    with open("vocabulary.json", 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, 30)
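# A hedged sketch of how the globals set by load_model_1() might be used to
# score a request; predict_emoji and the top-5 selection are illustrative
# additions, not part of the original snippet.
import numpy as np

def predict_emoji(text):
    # Tokenize the input with the global SentenceTokenizer (maxlen 30).
    tokenized, _, _ = st.tokenize_sentences([text])
    prob = model.predict(tokenized)[0]    # probability per emoji class
    top5 = np.argsort(prob)[-5:][::-1]    # indices of the 5 most likely emojis
    return [(int(i), float(prob[i])) for i in top5]

# Example (after load_model_1() has been called):
# print(predict_emoji("I love this so much"))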
from sentence_tokenizer import SentenceTokenizer
# deepmoji_emojis is used below but was not imported in the original snippet;
# it is assumed to live in a sibling module mirroring deepmoji.model_def.
from model_def import deepmoji_emojis
import json
import numpy as np


def top_elements(array, k):
    # Indices of the k largest entries, sorted from most to least likely.
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


maxlen = 50
batch_size = 32

vocabulary = json.load(open('servers/deepmoji/data/vocabulary.json', 'r'))
st = SentenceTokenizer(vocabulary, maxlen)
model = deepmoji_emojis(maxlen, 'servers/deepmoji/data/deepmoji_weights.hdf5')
# print('Ready')

while True:
    sentence = input()
    tokenized, _, _ = st.tokenize_sentences([sentence])
    prob = model.predict(tokenized)[0]
    scores = []
    # The scoring code below was commented out (and truncated) in the
    # original snippet; it is kept here as comments.
    # t_token = tokenized[0]
    # t_score = [sentence]
    # t_prob = prob[0]
    # ind_top = top_elements(t_prob, 5)
    # t_score.append(sum(t_prob[ind_top]))
    # t_score.extend(ind_top)
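# A hedged sketch of the per-sentence scoring that the commented-out block
# above appears to perform. It assumes `prob` is the emoji probability vector
# for the current sentence; the entry layout [text, summed top-5 probability,
# top-5 emoji indices] is illustrative only.
#
#     ind_top = top_elements(prob, 5)            # 5 most likely emoji indices
#     entry = [sentence, sum(prob[ind_top])]     # text plus top-5 confidence
#     entry.extend(ind_top)
#     scores.append(entry)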
# load data
data_pair = load_data(data_path)

# split into 5 folds
data_5fold = prepare_5fold(data_pair)

# load vocabulary and label2index dict
with open(vocab_path, "r") as f_vocab:
    vocabulary = json.load(f_vocab)
with open(label2index_path, "r") as f_label:
    label2index = json.load(f_label)
index2label = {i: l for (l, i) in label2index.items()}

# sentence tokenizer (MAX_LEN is the maximum length of an input text)
st = SentenceTokenizer(vocabulary, MAX_LEN)

fold = 0
# 5-fold cross-validation
for item in data_5fold:
    # prepare training and testing sets
    train_text = [p[0] for p in item[0]]
    train_label = [p[1] for p in item[0]]
    test_text = [p[0] for p in item[1]]
    test_label = [p[1] for p in item[1]]

    train_X, _, _ = st.tokenize_sentences(train_text)
    test_X, _, _ = st.tokenize_sentences(test_text)
    train_y = np.array([label2index[l] for l in train_label])
    test_y = np.array([label2index[l] for l in test_label])
def load_benchmark(path, vocab, extend_with=0):
    """ Loads the given benchmark dataset.

        Tokenizes the texts using the provided vocabulary, extending it with
        words from the training dataset if extend_with > 0. Splits them into
        three lists: training, validation and testing (in that order).

        Also calculates the maximum length of the texts and the
        suggested batch_size.

    # Arguments:
        path: Path to the dataset to be loaded.
        vocab: Vocabulary to be used for tokenizing texts.
        extend_with: If > 0, the vocabulary will be extended with up to
            extend_with tokens from the training set before tokenizing.

    # Returns:
        A dictionary with the following fields:
            texts: List of three lists, containing tokenized inputs for
                training, validation and testing (in that order).
            labels: List of three lists, containing labels for training,
                validation and testing (in that order).
            added: Number of tokens added to the vocabulary.
            batch_size: Batch size.
            maxlen: Maximum length of an input.
    """
    # Pre-processing dataset: read tab-separated lines of "<text>\t<label>"
    texts = []
    labels = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip().split('\t')
            if len(line) < 2:
                continue
            texts.append(line[0])
            labels.append(int(line[1]))

    # Decode data
    # try:
    #     texts = [unicode(x) for x in data['texts']]
    # except UnicodeDecodeError:
    #     texts = [x.decode('utf-8') for x in data['texts']]

    # Extract labels
    # labels = [x['label'] for x in data['info']]

    print(texts[0:10])
    print(labels[0:10])

    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(texts,
                                                   labels,
                                                   [0.88, 0.1, 0.02],
                                                   extend_with=extend_with)

    return {'texts': texts,
            'labels': labels,
            'added': added,
            'batch_size': batch_size,
            'maxlen': maxlen}
raw_dpath = os.path.join(os.getenv("OPIE_DIR"), "data/raw/domains")
domain_path = os.path.join(os.getenv("OPIE_DIR"), "data/domains", domain)
pickle_head = "/pickles/without_parse_sentences/without_parse_sentences_"
fname = domain + ".json.gz"

mkdir(domain_path)
spath = os.path.join(domain_path, "sentences")
if os.path.exists(spath):
    shutil.rmtree(spath)
mkdir(os.path.join(domain_path, "sentences"))
mkdir(os.path.join(domain_path, "pickles"))
mkdir(os.path.join(domain_path, "pickles/without_parse_sentences"))

sentences, i, k, review_index = [], 0, 1, 1
kk = 1
f = open(domain_path + "/sentences/sentences_1.txt", "a", encoding="utf8")
flag = False
myTokenizer = SentenceTokenizer()
for e in parse(os.path.join(raw_dpath, fname)):
    text, score = e['reviewText'], float(e['overall'])
    # Strip all control characters so that parsing does not fail
    text = re.sub(r'[\x00-\x1f]', '', text)
    sents = myTokenizer.segment_text(text)
    for sent in sents:
        t = Sentence()
        t.set_text_score_review(sent, score, review_index)
        if len(sent.split(' ')) > 50:
            continue
        sentences.append(t)
        print(sent, file=f)
        # Serialize (pickle) once every 60000 sentences
        if len(sentences) == 60000:
            save_pickle_file(domain_path + pickle_head +