def _csv_iterator(data_path, tokenizer, ngrams, yield_cls=False):
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for row in reader:
            tokens = ' '.join(row[1:])
            tokens = tokenizer(tokens)
            if yield_cls:
                yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
def _csv_iterator(data_path, ngrams, yield_cls=False): tokenizer = get_tokenizer("basic_english") with io.open(data_path, encoding="utf8") as f: reader = csv.reader(f) for row in reader: tokens = " ".join(row[1:]) tokens = tokenizer(tokens) if yield_cls: yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams) else: yield ngrams_iterator(tokens, ngrams)
def _imdb_iterator(key, extracted_files, tokenizer, ngrams, yield_cls=False):
    for fname in extracted_files:
        if 'urls' in fname:
            continue
        elif key in fname and ('pos' in fname or 'neg' in fname):
            with io.open(fname, encoding="utf8") as f:
                label = 1 if 'pos' in fname else 0
                if yield_cls:
                    yield label, ngrams_iterator(tokenizer(f.read()), ngrams)
                else:
                    yield ngrams_iterator(tokenizer(f.read()), ngrams)
def _csv_iterator(data_path, ngrams=1, yield_cls=False):
    tokenizer = get_tokenizer("basic_english")
    toxic = pd.read_csv(data_path)
    for i in range(len(toxic.tweet)):
        tokens = toxic.tweet[i]
        tokens = tokenizer(tokens)
        if yield_cls:
            # Assumes the label sits in the first column of the CSV
            # (the original snippet referenced an undefined `row[0]` here).
            yield int(toxic.iloc[i, 0]) - 1, ngrams_iterator(tokens, ngrams)
        else:
            yield ngrams_iterator(tokens, ngrams)
def _csv_iterator(data_path, ngrams, yield_cls=False): tokenizer = get_tokenizer("basic_english") with io.open(data_path, encoding="utf8") as f: reader = unicode_csv_reader(f) for row in reader: tokens = row[1] tokens = tokenizer(tokens) if yield_cls: yield 1 if int(row[4]) == 3 else 0, ngrams_iterator( tokens, ngrams) else: yield ngrams_iterator(tokens, ngrams)
def _csv_iterator(data_path, ngrams, dataset_name=None, yield_cls=False):
    # tokenizer = get_tokenizer("basic_english")
    tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f, delimiter='\t')
        for row in reader:
            tokens = row[1]
            tokens = tokenizer(tokens)
            if yield_cls:
                label = int(LABELS[dataset_name][row[0]]) - 1
                yield label, ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
def _csv_iterator(data_path, ngrams, yield_cls=False, label=-1):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f, delimiter="\t")
        for row in reader:
            tokens = row[5]
            tokens = tokenizer(tokens)
            if yield_cls:
                yield row[7], ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
def csv_iterator(data_path, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for row in reader:
            tokens = ' '.join(row[1:])
            yield ngrams_iterator(tokenizer(tokens), ngrams)
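# Illustrative usage of csv_iterator() above (an assumption, not part of the
# original snippet): build a vocabulary from a training CSV, mirroring the
# legacy torchtext text-classification setup. `train_csv_path` and `NGRAMS`
# are hypothetical names.
from torchtext.vocab import build_vocab_from_iterator

NGRAMS = 2
train_csv_path = "data/train.csv"  # hypothetical path
vocab = build_vocab_from_iterator(csv_iterator(train_csv_path, NGRAMS))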
def _compute_ngram_counter(tokens, max_n):
    """Create a Counter with a count of unique n-grams in the tokens list.

    Arguments:
        tokens: a list of tokens (typically a string split on whitespace)
        max_n: the maximum order of n-gram wanted

    Outputs:
        output: a collections.Counter object with the unique n-grams and their
            associated count

    Examples:
        >>> from torchtext.data.metrics import _compute_ngram_counter
        >>> tokens = ['me', 'me', 'you']
        >>> _compute_ngram_counter(tokens, 2)
        Counter({('me',): 2,
                 ('you',): 1,
                 ('me', 'me'): 1,
                 ('me', 'you'): 1})
    """
    assert max_n > 0
    ngrams_counter = collections.Counter(
        tuple(x.split(' ')) for x in ngrams_iterator(tokens, max_n))
    return ngrams_counter
def text_to_tensor(text, vocab, ngrams):
    tokens = ngrams_iterator(tokenizer(text), ngrams=ngrams)
    token_ids = list(
        filter(lambda x: x is not Vocab.UNK,
               [vocab[token] for token in tokens]))
    tokens = torch.tensor(token_ids)
    return tokens
def classify(text):
    print('predicting [' + text + ']')
    # normalize input string: put a space before each CJK character
    text = re.sub(r'([\u4e00-\u9fff])', r' \1', text)
    l = list(utils.ngrams_iterator(_basic_english_normalize(text), 2))
    l = [[stoi.get(token, 0) for token in l]]
    text = torch.tensor(l)
    with torch.no_grad():
        result = model(text, None)
    # sort results by score, descending
    value, index = torch.sort(result, descending=True)
    classIdx = []
    scores = []
    print("===========================================")
    # pick only the 3 most relevant classes
    for i in range(0, 3):
        idx = index[0][i].item() + 1
        score = value[0][i].item()
        print("class:{}, score:{}".format(class_idx_to_name[idx], score))
        classIdx.append(idx)
        scores.append(int(score))
    return classIdx, scores
def preprocess(self, data): """ Normalizes the input text for PyTorch model using following basic cleanup operations : - remove html tags - lowercase all text - expand contractions [like I'd -> I would, don't -> do not] - remove accented characters - remove punctuations Converts the normalized text to tensor using the source_vocab. Returns a Tensor """ line = data[0] text = line.get("data") or line.get("body") if isinstance(text, (bytes, bytearray)): text = text.decode('utf-8') text = self._remove_html_tags(text) text = text.lower() text = self._expand_contractions(text) text = self._remove_accented_characters(text) text = self._remove_punctuation(text) text = self._tokenize(text) text = torch.as_tensor([ self.source_vocab[token] for token in ngrams_iterator(text, self.ngrams) ], device=self.device) return text
def preprocess(self, data): """ Normalizes the input text for PyTorch model using following basic cleanup operations : - remove html tags - lowercase all text - expand contractions [like I'd -> I would, don't -> do not] - remove accented characters - remove punctuations Converts the normalized text to tensor using the source_vocab. Returns a Numpy array. """ ngrams = 2 text = data[0].get("data") if text is None: text = data[0].get("body") text = text.decode('utf-8') text = self._remove_html_tags(text) text = text.lower() text = self._expand_contractions(text) text = self._remove_accented_characters(text) text = self._remove_punctuation(text) text = self._tokenize(text) text = torch.tensor([ self.source_vocab[token] for token in ngrams_iterator(text, ngrams) ]) return text
def __getitem__(self, i):
    raw_datum: Example = super().__getitem__(i)
    tokens = raw_datum.text
    ngrams = list(ngrams_iterator(tokens, self.ngrams))
    text = self.fields["text"].numericalize([ngrams]).squeeze()
    label = int(self.fields["label"].numericalize([raw_datum.label]))
    return label, text
def testing_predict(model, vocabulary, ngrams):
    tokenizer = get_tokenizer("basic_english")
    ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}
    ex_text_str = """
        MEMPHIS, Tenn. – Four days ago, Jon Rahm was enduring the season’s worst
        weather conditions on Sunday at The Open on his way to a closing 75 at
        Royal Portrush, which considering the wind and the rain was a respectable
        showing. Thursday’s first round at the WGC-FedEx St. Jude Invitational
        was another story. With temperatures in the mid-80s and hardly any wind,
        the Spaniard was 13 strokes better in a flawless round. Thanks to his
        best putting performance on the PGA Tour, Rahm finished with an 8-under
        62 for a three-stroke lead, which was even more impressive considering
        he’d never played the front nine at TPC Southwind.
    """
    with torch.no_grad():
        text = torch.tensor([
            vocabulary[token]
            for token in ngrams_iterator(tokenizer(ex_text_str), ngrams)
        ])
        output = model(text, torch.tensor([0]))
    print("\nTesting the prediction of sample text:")
    print(ex_text_str)
    print("This is a %s news" % ag_news_label[output.argmax(1).item() + 1])
def predict(text, model, vocab, ngrams): tokenizer = get_tokenizer("basic_english") with torch.no_grad(): text = torch.tensor([vocab[token] for token in ngrams_iterator(tokenizer(text), ngrams)]) output = model(text, torch.tensor([0])) return output.argmax(1).item() + 1
def run(raw_data):
    global model, dictionary, tokenizer, ngrams, device
    prev_time = time.time()

    post = json.loads(raw_data)
    incoming_text = post['text']

    with torch.no_grad():
        text = torch.tensor([
            dictionary[token]
            for token in ngrams_iterator(tokenizer(incoming_text), ngrams)
        ])
        output = model(text.to(device), torch.tensor([0]).to(device))

    current_time = time.time()
    inference_time = datetime.timedelta(seconds=current_time - prev_time)

    payload = {
        'time': str(inference_time.total_seconds()),
        'text': incoming_text,
        'scores': output[0].tolist(),
        'rating': output.argmax(1).item() + 1,
    }
    print('Input ({}), Prediction ({})'.format(text, payload))
    return payload
def transform_data(a_dataset, unique_vocab_dict, classifier_name, NGRAMS):
    if classifier_name == 'LR':
        train_X = torch.zeros(len(a_dataset), len(unique_vocab_dict))
        for i in tqdm(range(len(a_dataset))):
            tokens = nltk.word_tokenize(a_dataset[i][0])
            tokens = tokens if NGRAMS == 1 else ngrams_iterator(tokens, NGRAMS)
            for j in tokens:
                # Skip punctuation and a small set of stop words.
                if j in string.punctuation or j in {
                        'to', 'and', 'the', 'be', 'a', 'is', 'that', 'of'}:
                    continue
                try:
                    train_X[i][unique_vocab_dict[j]] += 0.5
                except KeyError:
                    pass
        train_Y = torch.Tensor([i[1] for i in a_dataset]).long()
    else:
        train_X = convert_texts_to_ids([x[0] for x in a_dataset],
                                       unique_vocab_dict,
                                       max_seq_length=17,
                                       do_lower_case=False,
                                       sos=False,
                                       eos=False)
        train_X = torch.Tensor(train_X)
        train_Y = torch.Tensor([x[1] for x in a_dataset]).long()
    # Labels = train_Y
    # train_Y = torch.zeros(len(a_dataset), 2)
    # for i in Labels:
    #     train_Y[i] = torch.tensor(np.eye(2)[i])
    return train_X, train_Y
def _pd_iterator(data_to_parse: np.ndarray, ngrams: int, yield_cls: bool = False):
    """
    :param data_to_parse: array of two columns, label and text
    :param ngrams: maximum n-gram order
    :param yield_cls: whether to yield the label along with the text
    :return: generator used later when building the torch dataset
    """
    tokenizer = get_tokenizer(None)
    for row_id in range(len(data_to_parse)):
        tokens = data_to_parse[row_id][1]
        tokens = tokenizer(tokens)
        if yield_cls:
            yield data_to_parse[row_id][0], ngrams_iterator(tokens, ngrams)
        else:
            yield ngrams_iterator(tokens, ngrams)
def predict(text): tokenizer = get_tokenizer("basic_english") vocab = train_dataset.get_vocab() with torch.no_grad(): text = torch.tensor([ vocab[token] for token in ngrams_iterator(tokenizer(text), NGRAMS) ]) output = model(text, torch.tensor([0])) return "Relaxing" if output.argmax(1).item() == 1 else "Not Relaxing"
def ToEmbed(text, model):
    tokenizer = get_tokenizer("basic_english")
    tokenized_text = tokenizer(text)
    origin_tensor = torch.tensor(
        [model.vocab[token] for token in ngrams_iterator(tokenized_text, 1)])
    origin_tensor = torch.stack([origin_tensor], 0)
    origin_tensor = model.embed(origin_tensor)
    return origin_tensor
def build_vocab(xlist, NGRAMS, min_count):
    vocabi2w = ['[SOS]', '[EOS]', '[PAD]', '[UNK]']  # A list of unique words
    seen = collections.defaultdict(int)
    for i in tqdm(range(len(xlist))):
        tokens = nltk.word_tokenize(xlist[i][0])
        tokens = tokens if NGRAMS == 1 else ngrams_iterator(tokens, NGRAMS)
        for token in tokens:
            seen[token] += 1
    vocabi2w += [x for x in seen if seen[x] >= min_count]
    vocabw2i = {vocabi2w[x]: x for x in range(len(vocabi2w))}
    return vocabw2i, vocabi2w
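# Illustrative call to build_vocab() above with toy data (not from the original
# snippet): xlist is assumed to be a list of (text, label) pairs, and tokens
# seen fewer than min_count times fall back to the '[UNK]' index.
xlist = [("the cat sat on the mat", 0), ("the dog barked", 1)]
vocabw2i, vocabi2w = build_vocab(xlist, NGRAMS=2, min_count=1)
ids = [vocabw2i.get(tok, vocabw2i['[UNK]'])
       for tok in nltk.word_tokenize(xlist[0][0])]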
def _data_iterator(data_rows, ngrams, yield_cls=False):
    """Yield n-gram token iterators (optionally with labels) from raw rows.

    Args:
        data_rows: iterable of rows where column 0 holds the label and the
            remaining columns hold the text.
        ngrams: maximum n-gram order passed to ngrams_iterator.
        yield_cls (bool, optional): if True, yield (label, tokens) pairs with
            the label shifted to start at 0. Defaults to False.

    Yields:
        an n-gram iterator, or a (label, n-gram iterator) tuple when
        yield_cls is True.
    """
    tokenizer = get_tokenizer("basic_english")
    for row in data_rows:
        tokens = ' '.join(row[1:])
        tokens = tokenizer(tokens)
        if yield_cls:
            yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams)
        else:
            yield ngrams_iterator(tokens, ngrams)
def predict(self, text):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([
            self.vocab[token]
            for token in ngrams_iterator(tokenizer(text), self.ngrams)
        ])
        output = self.model(text, torch.tensor([0]))
        result = output.argmax(1).item()
        label = self.labels[result + 1]
        return label
def csv_iterator(data_path, ngrams, yield_cls=False): """ 加载csv文本文件,并根据原始文本 生成 指定ngram语法的 词汇(token)样本 Args: data_path: ngrams: yield_cls: Returns: """ with io.open(data_path, encoding="utf8") as f: reader = unicode_csv_reader(f) for row in reader: tokens = ' '.join(row[1:]) tokens = tokenizer(tokens) if yield_cls: yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams) else: yield ngrams_iterator(tokens, ngrams)
def predict(text, model, vocab, ngrams):
    # tokenizer = get_tokenizer("basic_english")
    tokens = text.split()  # simple whitespace tokenization instead of basic_english
    print('tokens', tokens, '\n')
    with torch.no_grad():
        text = torch.tensor(
            [vocab[token] for token in ngrams_iterator(tokens, ngrams)])
        # print("word_set: ", word_set, '\n')  # used for testing
        # print("text: ", text, "\n")  # used for testing
        output = model(text, torch.tensor([0]))
        print("output: ", output, '\n')
        return output.argmax(1).item() + 1
def predict_review_sentiment(text):
    # Convert text to tensor
    text = torch.tensor(
        [VOCAB[token] for token in ngrams_iterator(TOKENIZER(text), NGRAMS)])
    # Compute output
    # TODO compute the output of the model. Note that you will have to give it
    # a 0 as an offset.
    output = ...
    confidences = torch.softmax(output, dim=1)
    # Class 1 corresponds to confidence of positive
    return confidences.squeeze()[1].item()
def _csv_iterator(data_path,
                  ngrams,
                  skip_header=True,
                  yield_cls=False,
                  label_col=6,
                  token_col=[1, 5],
                  label_mapping={
                      "simulation": 0,
                      "hardware": 1,
                      "edge_computing": 2
                  }):
    tokenizer = get_tokenizer("spacy", "en_core_web_sm")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        if skip_header:
            next(reader, None)
        for row in reader:
            tokens = ' '.join([j for i, j in enumerate(row) if i in token_col])
            tokens = tokenizer(tokens)
            if yield_cls:
                yield label_mapping[row[label_col]], ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
def predict_fn(sentence, model_dict):
    logger.info('predict_fn: Predicting for {}.'.format(sentence))
    model = model_dict['model']
    dictionary = model_dict['dictionary']
    with torch.no_grad():
        sentence_tensor = torch.tensor([
            dictionary[token]
            for token in ngrams_iterator(_tokenizer(sentence), _ngrams)
        ])
        output = model(sentence_tensor, torch.tensor([0]))
        label = output.argmax(1).item() + 1
    logger.info('predict_fn: Prediction result is {}.'.format(label))
    return label
def predict(text, model, vocab, ngrams): tokenizer = get_tokenizer("basic_english") with torch.no_grad(): text = torch.tensor([ vocab[token] for token in ngrams_iterator(tokenizer(text), ngrams) ]) output = model(text, torch.tensor([0])) ret = output > THRESHOLD result = [] print(ret) cnt = 0 for r in ret[0]: if r: result.append(cnt) cnt += 1 return result