def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, categories = [], []
    with open(argv[1], 'r', newline='') as f:  # text mode for Python 3's csv module
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories"])
        count = 0
        for row in reader:
            count += 1
            text = row['title'] + ' ' + row['description']
            category = row['categories'].split(' / ')[0]
            texts.append(text)
            categories.append(category)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from classifier
    classifier = ProductClassifier()
    labels = classifier.get_labels(categories)

    # Compile classifier network and train
    classifier.compile(tokenizer)
    classifier.train(data, labels)
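# Entry-point sketch (hedged: not part of the original snippet; the script
# name and CSV path are illustrative). This training script, like the two
# variants below, takes the product CSV as its first command-line argument:
#
#     python train_classifier.py data/products.csv
#
if __name__ == '__main__':
    main(sys.argv)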
def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, tags = [], []
    with open(argv[1], 'r', newline='') as f:  # text mode for Python 3's csv module
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories", "tags"])
        count = 0
        for row in reader:
            count += 1
            text = row['title']
            # [:-1] drops the empty string left by a trailing space (assumed data format)
            tag_set = row['tags'].split(' ')[:-1]
            texts.append(text)
            tags.append(tag_set)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from NER
    ner = ProductNER()
    labels = ner.get_labels(tags)

    # Compile NER network and train
    ner.compile(tokenizer)
    ner.train(data, labels)
class TfIdfEvaluator(SentencesEvaluator):
    def __init__(self, language_params):
        super(TfIdfEvaluator, self).__init__(language_params, "TF-IDF Evaluator")
        self.tokenizer = WordTokenizer(self.language_params)
        self.tf_idf = TfidfVectorizer(tokenizer=self.tokenizer.tokenize)

    def train(self, training_set):
        self.tf_idf.fit(training_set)

    def evaluate(self, sentences):
        words_weights = self.__get_words_weights(sentences)
        sentences_weights = []
        for i, s in enumerate(sentences):
            words = self.tokenizer.tokenize(s)
            weights_sum = sum(words_weights.get(w, 0) for w in words)
            if len(words) > 0:
                sentences_weights.append((i, float(weights_sum)))
        # Rank by total TF-IDF weight, highest first (the original sorted the
        # (index, weight) tuples themselves, which ordered by index instead)
        return sorted(sentences_weights, key=lambda iw: iw[1], reverse=True)

    def __get_words_weights(self, test_set):
        # Join with spaces so the last word of one sentence does not fuse
        # with the first word of the next
        weights = self.tf_idf.transform([' '.join(test_set)]).toarray()[0]
        features = self.tf_idf.get_feature_names()  # get_feature_names_out() in scikit-learn >= 1.0
        return dict(zip(features, weights))

    def encode_list(self, items):  # renamed from `list` to avoid shadowing the builtin
        return [self.__encode_text(a) for a in items]

    def __encode_text(self, text):
        return text.encode(sys.stdout.encoding, errors='replace')
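# Minimal usage sketch (hedged: not from the original source; `language_params`
# is a placeholder for whatever configuration object SentencesEvaluator
# expects, and the corpus is invented for illustration).
evaluator = TfIdfEvaluator(language_params)
evaluator.train(["The cat sat on the mat.", "Dogs chase cats outside."])
# evaluate() returns (sentence_index, total_weight) pairs, highest weight first
print(evaluator.evaluate(["Cats nap often.", "It rained."]))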
def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, categories = [], []
    with open(argv[1], 'r', newline='') as f:  # text mode for Python 3's csv module
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories"])
        count = 0
        for row in reader:
            count += 1
            # TODO change here what we train on, and what categories are used
            text = row['title']
            category = row['categories'].split(' / ')[0]
            texts.append(text)
            categories.append(category)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Keep only categories with more than 200 examples
    tmpx, tmpy = [], []
    c = Counter(categories)
    for x, y in zip(texts, categories):
        if c[y] > 200:
            tmpx.append(x)
            tmpy.append(y)
    texts, categories = tmpx, tmpy
    print(Counter(categories))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from classifier
    classifier = ProductClassifier()
    labels = classifier.get_labels(categories)

    # Compile classifier network and train
    classifier.compile(tokenizer)
    classifier.train(data, labels)
def __text_to_vector(self, sentence):
    tokenizer = WordTokenizer(self.language_params)
    words = tokenizer.tokenize(sentence)
    return Counter(words)
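# Hedged aside (not from the original source): a common reason for returning
# a Counter bag-of-words like the one above is to compare two texts with
# cosine similarity over their word counts. A self-contained sketch:
import math
from collections import Counter

def cosine_similarity(vec_a, vec_b):
    # dot product over the words the two bags share
    dot = sum(vec_a[w] * vec_b[w] for w in set(vec_a) & set(vec_b))
    norm_a = math.sqrt(sum(c * c for c in vec_a.values()))
    norm_b = math.sqrt(sum(c * c for c in vec_b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

a = Counter("the cat sat on the mat".split())
b = Counter("the cat lay on the rug".split())
print(cosine_similarity(a, b))  # 0.75 for these two sentences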
import os

import numpy as np

from tokenizer import WordTokenizer
from classifier import ProductClassifier

model_dir = './models'

# Load tokenizer
tokenizer = WordTokenizer()
tokenizer.load(os.path.join(model_dir, 'tokenizer'))

# Load classifier
classifier = ProductClassifier()
classifier.load(os.path.join(model_dir, 'classifier'))

data = tokenizer.tokenize(["Cambridge wall calendar"])
class_scores = classifier.classify(data)[0]
print(class_scores)

# Pick the highest-scoring class. Dict views are not indexable in Python 3,
# so materialize the scores as a list before calling argmax.
scores = list(class_scores.values())
best_idx = np.argmax(scores)
best_val = scores[best_idx]
best_class = list(class_scores)[best_idx]
print(best_val, best_class)
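# Design note (hedged: assumes classify() returns a plain {class: score} dict,
# as the code above suggests). The NumPy detour can be replaced with the
# built-in max over keys:
best_class = max(class_scores, key=class_scores.get)
best_val = class_scores[best_class]
print(best_val, best_class)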
import os

import torch

# import model and tokenizer
from model import CMLA
from tokenizer import WordTokenizer

# model directory
model_dir = "results/SemEval2015"

# sample texts (the last uncommented assignment is the one used)
text = "The ambience is nice for conversation."
text = "The staff was really nice but the food was disgusting!"
# text = "In the summer months, the back garden area is really nice."
# text = "Das Essen war sehr lecker."  (German: "The food was very tasty.")

# load tokenizer and model
print("Loading Tokenizer and Model...")
tokenizer = WordTokenizer(os.path.join(model_dir, 'vocab.txt'))
model = CMLA.load(model_dir)
model.eval()

# tokenize text
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# convert to a batch tensor and pass through the model
# (calling the model directly is preferred over model.forward in PyTorch)
token_ids = torch.LongTensor([token_ids])
with torch.no_grad():
    aspect_logits, opinion_logits = model(token_ids)

# get per-token predictions from the logits
aspect_predicts = aspect_logits[0, :].max(dim=-1)[1]
opinion_predicts = opinion_logits[0, :].max(dim=-1)[1]

print(tokens)
print(aspect_predicts)
print(opinion_predicts)
def evaluate(self, sentences):
    tokenizer = WordTokenizer(self.language_params)
    tokenized_sents = [tokenizer.tokenize(s) for s in sentences]
    return self._get_lengths(tokenized_sents)
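# Hypothetical sketch (not from the original source) of the _get_lengths
# helper referenced above. A plausible implementation ranks sentences by
# token count, mirroring the (index, weight) pairs that
# TfIdfEvaluator.evaluate returns:
def _get_lengths(self, tokenized_sents):
    sentence_lengths = [(i, float(len(words)))
                        for i, words in enumerate(tokenized_sents)]
    return sorted(sentence_lengths, key=lambda iw: iw[1], reverse=True)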