Code example #1
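Trains a product-category classifier on each product's title and description, using the first segment of the ' / '-separated categories field as the label.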
import csv
import sys

def main(argv):
    if len(argv) < 2:
        usage()  # assumed to print a usage message and exit

    # Fetch data
    texts, categories = [], []
    with open(argv[1], newline='') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories"])
        count = 0
        for row in reader:
            count += 1
            # Train on title + description; label is the top-level category
            text = row['title'] + ' ' + row['description']
            category = row['categories'].split(' / ')[0]
            texts.append(text)
            categories.append(category)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from classifier
    classifier = ProductClassifier()
    labels = classifier.get_labels(categories)

    # Compile classifier network and train
    classifier.compile(tokenizer)
    classifier.train(data, labels)
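This example leaves usage, MAX_TEXTS, WordTokenizer, and ProductClassifier undefined; they come from elsewhere in the project (example #2 below relies on the same names). A minimal sketch of stand-ins for the two simple ones, with the script name and row cap purely placeholders, might look like:

import sys

# Hypothetical stand-ins, inferred only from how the example calls them;
# the real project presumably ships its own definitions.
MAX_TEXTS = 10000  # assumed cap on the number of CSV rows to read

def usage():
    # Print a usage hint and exit so main() never proceeds without an input file.
    sys.exit('usage: python train.py <products.csv>')

WordTokenizer and ProductClassifier are only exercised through load/tokenize and get_labels/compile/train here, so no interface beyond those calls can be inferred.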
Code example #2
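A variant of example #1 that trains on the title alone and, before training, drops every category with 200 or fewer examples.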
import csv
import sys
from collections import Counter

def main(argv):
    if len(argv) < 2:
        usage()  # assumed to print a usage message and exit

    # Fetch data
    texts, categories = [], []
    with open(argv[1], newline='') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "brand", "description", "categories"])
        count = 0
        for row in reader:
            count += 1
            # TODO change here what we train on, and what categories are used
            text, category = row['title'], row['categories'].split(' / ')[0]
            texts.append(text)
            categories.append(category)
            if count >= MAX_TEXTS:
                break
    print('Processed %s texts.' % len(texts))

    # Keep only examples whose category appears more than 200 times
    category_counts = Counter(categories)
    filtered_texts, filtered_categories = [], []
    for text, category in zip(texts, categories):
        if category_counts[category] > 200:
            filtered_texts.append(text)
            filtered_categories.append(category)

    texts = filtered_texts
    categories = filtered_categories

    print(Counter(categories))

    # Tokenize texts
    tokenizer = WordTokenizer()
    tokenizer.load()
    data = tokenizer.tokenize(texts)

    # Get labels from classifier
    classifier = ProductClassifier()
    labels = classifier.get_labels(categories)

    # Compile classifier network and train
    classifier.compile(tokenizer)
    classifier.train(data, labels)
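Neither example defines an entry point. To run one as a script, the standard guard can be appended (a minimal sketch; the filename train.py is a placeholder):

if __name__ == '__main__':
    main(sys.argv)

It would then be invoked as, e.g., python train.py products.csv, where the CSV has no header row and its columns match the fieldnames passed to DictReader: title, brand, description, categories.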