def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, categories = [], []
    with open(argv[1], 'r', newline='') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories"])
        count = 0
        for row in reader:
            count += 1
            text = row['title'] + ' ' + row['description']
            category = row['categories'].split(' / ')[0]
            texts.append(text)
            categories.append(category)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from classifier
    classifier = ProductClassifier()
    labels = classifier.get_labels(categories)

    # Compile classifier network and train
    classifier.compile(tokenizer)
    classifier.train(data, labels)
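
The fragment above also relies on a few module-level names the snippet does not show. A minimal sketch of what they could look like, reusing the imports that appear in Example #6 below; the MAX_TEXTS value, the script name and the usage text are assumptions, not taken from the original project:

import sys, csv

from tokenizer import WordTokenizer        # same imports as in Example #6
from classifier import ProductClassifier

MAX_TEXTS = 100000  # assumed cap on the number of CSV rows to read


def usage():
    # hypothetical helper; the original script may print something else
    print('usage: python train_classifier.py <products.csv>')
    sys.exit(1)


if __name__ == '__main__':
    main(sys.argv)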
Example #2
def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, tags = [], []
    with open(argv[1], 'r', newline='') as f:
        reader = csv.DictReader(
            f,
            fieldnames=["title", "brand", "description", "categories", "tags"])
        count = 0
        for row in reader:
            count += 1
            text, tag_set = row['title'], row['tags'].split(' ')[:-1]
            texts.append(text)
            tags.append(tag_set)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from NER
    ner = ProductNER()
    labels = ner.get_labels(tags)

    # Compile NER network and train
    ner.compile(tokenizer)
    ner.train(data, labels)
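
The slicing on the 'tags' field implies each row stores space-separated per-token tags with a trailing separator. An illustrative row (the field values and tag names are made up, not taken from the original dataset):

row = {'title': 'Cambridge wall calendar',
       'tags': 'B-BRAND O O '}              # note the trailing space
tag_set = row['tags'].split(' ')[:-1]       # -> ['B-BRAND', 'O', 'O']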
class TfIdfEvaluator(SentencesEvaluator):
    def __init__(self, language_params):
        super(TfIdfEvaluator, self).__init__(language_params, "TF-IDF Evaluator")
        self.tokenizer = WordTokenizer(self.language_params)
        self.tf_idf = TfidfVectorizer(tokenizer=self.tokenizer.tokenize)

    def train(self, training_set):
        self.tf_idf.fit(training_set)

    def evaluate(self, sentences):
        words_weights = self.__get_words_weights(sentences)
        sentences_weights = []

        for i, s in enumerate(sentences):
            words = self.tokenizer.tokenize(s)
            weights_sum = sum([words_weights.get(w, 0) for w in words])
            if len(words) > 0:
                sentences_weights.append((i, float(weights_sum)))

        # rank sentences by their total tf-idf weight, highest first
        return sorted(sentences_weights, key=lambda iw: iw[1], reverse=True)

    def __get_words_weights(self, test_set):
        # join with spaces so words at sentence boundaries are not glued together
        weights = self.tf_idf.transform([' '.join(test_set)]).toarray()[0]
        # scikit-learn >= 1.0; older versions used get_feature_names()
        features = self.tf_idf.get_feature_names_out()
        f_weights = zip(features, weights)
        return dict(f_weights)

    def encode_list(self, texts):
        return [self.__encode_text(t) for t in texts]

    def __encode_text(self, text):
        return text.encode(sys.stdout.encoding, errors='replace')
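
For context, a minimal usage sketch of the evaluator above. The class also needs TfidfVectorizer imported at module level; SentencesEvaluator, language_params and the sample strings here are placeholders for whatever the surrounding project provides:

from sklearn.feature_extraction.text import TfidfVectorizer  # used by __init__

language_params = ...  # stand-in: the real object comes from the surrounding project

evaluator = TfIdfEvaluator(language_params)
evaluator.train([
    "First training document about wall calendars.",
    "Second training document about office supplies.",
])

ranked = evaluator.evaluate([
    "Cambridge wall calendar with twelve monthly pages.",
    "Completely unrelated sentence.",
])
print(ranked)  # list of (sentence_index, weight) pairs, highest weight first
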
def main(argv):
    if len(argv) < 2:
        usage()

    # Fetch data
    texts, categories = [], []
    with open(argv[1], 'r', newline='') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories"])
        count = 0
        for row in reader:
            count += 1
            # TODO change here what we train on, and what categories are used
            text, category = row['title'], row['categories'].split(' / ')[0]
            texts.append(text)
            categories.append(category)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Keep only examples whose category occurs more than 200 times
    tmpx, tmpy = [], []
    c = Counter(categories)
    for x, y in zip(texts, categories):
        if c[y] > 200:
            tmpx.append(x)
            tmpy.append(y)

    texts = tmpx
    categories = tmpy

    print(Counter(tmpy))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from classifier
    classifier = ProductClassifier()
    labels = classifier.get_labels(categories)

    # Compile classifier network and train
    classifier.compile(tokenizer)
    classifier.train(data, labels)
    def __text_to_vector(self, sentence):
        tokenizer = WordTokenizer(self.language_params)
        words = tokenizer.tokenize(sentence)
        return Counter(words)
Example #6
import sys, os, csv
import numpy as np
from operator import itemgetter
from tokenizer import WordTokenizer
from classifier import ProductClassifier
model_dir = './models'

tokenizer = WordTokenizer()
tokenizer.load(os.path.join(model_dir, 'tokenizer'))

# Load classifier
classifier = ProductClassifier()
classifier.load(os.path.join(model_dir, 'classifier'))

data = tokenizer.tokenize(["Cambridge wall calendar"])
classScores = classifier.classify(data)[0]
print(classScores)

# classScores appears to be a dict mapping class names to scores; dict views
# cannot be indexed in Python 3, so materialize them as lists first
scores = list(classScores.values())
classes = list(classScores.keys())
bestValIdx = np.argmax(scores)
bestVal = scores[bestValIdx]
bestClass = classes[bestValIdx]
print(bestVal, bestClass)
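
If classScores really is a plain dict of class name to score (an assumption based on how it is indexed above), the same lookup can be written without numpy:

# assumes classScores maps class names to numeric scores
bestClass = max(classScores, key=classScores.get)
bestVal = classScores[bestClass]
print(bestVal, bestClass)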
Example #7
# import model and tokenizer
import os

import torch

from model import CMLA
from tokenizer import WordTokenizer

# model directory
model_dir = "results/SemEval2015"
# sample text
text = "The ambience is nice for conversation."
text = "The staff was really nice but the food was disgusting!"
# text = "In the summer months, the back garden area is really nice."
# text = "Das Essen war sehr lecker."

# load tokenizer and model
print("Loading Tokenizer and Model...")
tokenizer = WordTokenizer(os.path.join(model_dir, 'vocab.txt'))
model = CMLA.load(model_dir)
model.eval()
# tokenize text
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# convert to tensor and pass through model
token_ids = torch.LongTensor([token_ids])
aspect_logits, opinion_logits = model(token_ids)  # calling the model invokes forward()
# get predictions from logits
aspect_predicts = aspect_logits[0, :].max(dim=-1)[1]
opinion_predicts = opinion_logits[0, :].max(dim=-1)[1]

print(tokens)
print(aspect_predicts)
print(opinion_predicts)
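
To read the output per token, the two prediction tensors can be lined up with the token list; this is only a viewing aid and makes no assumption about what the individual label indices mean:

# print each token next to its predicted aspect / opinion label index
for tok, a_id, o_id in zip(tokens, aspect_predicts.tolist(), opinion_predicts.tolist()):
    print('%-15s aspect=%d opinion=%d' % (tok, a_id, o_id))
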
    def evaluate(self, sentences):
        tokenizer = WordTokenizer(self.language_params)
        tokenized_sents = [tokenizer.tokenize(s) for s in sentences]

        return self._get_lengths(tokenized_sents)