Example #1
def dump_reuters():
    # For every Reuters category, write each paragraph longer than 50 tokens
    # to data_set\reuters_nltk\<category>.txt and record per-category counts
    # in a summary file.
    nb_total = 0
    synth_file = open('data_set\\synthese_reuters.txt', 'w')
    for cat in reuters.categories():
        # Deduplicate the category's paragraphs.
        text_arr = np.unique(np.array(reuters.paras(categories=[cat]), dtype=object))
        file_object = open('data_set\\reuters_nltk\\' + cat + '.txt', 'w')
        nb_paraph = 0
        for para in text_arr:
            # Keep only paragraphs with more than 50 tokens in total.
            if sum(len(sent) for sent in para) > 50:
                # Join tokens and sentences back into one space-separated line.
                paragraph = ' '.join(' '.join(sent) for sent in para)
                file_object.write(paragraph)
                file_object.write('\n')
                nb_paraph += 1
        file_object.close()
        synth_file.write('categorie ' + cat + ' : ' + str(nb_paraph) + '\n')
        nb_total += nb_paraph
    synth_file.write('Total : ' + str(nb_total))
    synth_file.close()
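
dump_reuters() writes into data_set\reuters_nltk\ without creating the folders, so on a fresh machine the corpus and the output directories have to be prepared first. A minimal setup sketch (the directory names come from the snippet itself; nltk.download only has to run once):

import os
import nltk
import numpy as np                   # module-level imports the snippet assumes
from nltk.corpus import reuters

nltk.download('reuters')                              # fetch the corpus once
os.makedirs('data_set\\reuters_nltk', exist_ok=True)  # folders the function writes into
dump_reuters()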
Example #2
def train():
    # Train Word2Vec on all paragraphs of the Brown, Gutenberg and Reuters corpora.
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)  # progress indicator
        # Flatten the paragraph (a list of tokenised sentences) into one string,
        # then normalise it back into a token list for Word2Vec.
        content = ' '.join(map(lambda x: ' '.join(x), para))
        texts.append(normalize_tokens(content))

    # gensim < 4.0 API; from gensim 4.0 on, pass vector_size=100 instead of size=100.
    w2v = Word2Vec(texts, size=100, window=5, min_count=5, workers=4)
    w2v.save(model_path)
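
Once trained, the saved model can be reloaded and queried for nearest neighbours. A minimal sketch, assuming the same model_path and a query word that survived the min_count=5 cutoff ('oil' is only illustrative; the vectors are accessed through w2v.wv):

from gensim.models import Word2Vec

w2v = Word2Vec.load(model_path)          # model_path as used by train() above
print(w2v.wv.most_similar('oil')[:5])    # five nearest neighbours of the query word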
Example #3
def train():
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)
        # Flatten the paragraph into one string; the normalised tokens are
        # re-joined because CountVectorizer expects raw strings, not token lists.
        content = ' '.join(map(lambda x: ' '.join(x), para))
        texts.append(' '.join(normalize_tokens(content)))

    # Term-document counts, then LSA; the matrix is transposed so TruncatedSVD
    # yields one 100-dimensional row per vocabulary term (an LSA word vector).
    transformer = CountVectorizer()
    tf = transformer.fit_transform(texts)
    svd = TruncatedSVD(n_components=100)
    lsa = svd.fit_transform(tf.T)

    lsa.dump(open(model_path, 'wb'))
    pickle.dump(transformer.vocabulary_, open(vocab_path, 'wb'))
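
Because the count matrix is transposed before the SVD, each row of lsa is the LSA vector of one vocabulary term, indexed by transformer.vocabulary_. A minimal sketch of reading the dumps back and comparing two words by cosine similarity (the word pair is illustrative; both words must be in the fitted vocabulary):

import pickle
import numpy as np

lsa = np.load(model_path, allow_pickle=True)   # pickled ndarray written by lsa.dump()
vocab = pickle.load(open(vocab_path, 'rb'))    # term -> row index

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

print(cosine(lsa[vocab['oil']], lsa[vocab['crude']]))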
Example #4
    def get_most_populous_categs(self):
        # Return the names of the 10 Reuters categories with the most paragraphs.
        cats = reuters.categories()

        categ_dict = {}

        total_multi = 0  # paragraphs counted once per category they belong to
        for c in cats:
            lcat = len(reuters.paras(categories=[c]))
            total_multi += lcat
            categ_dict[c] = lcat
        most_populous_categs = sorted(categ_dict.items(),
                                      key=itemgetter(1),
                                      reverse=True)
        # keep the top 10 and return only the category names
        top_10_populous_categs = most_populous_categs[:10]
        top_10_populous_categs_names = [i[0] for i in top_10_populous_categs]
        return top_10_populous_categs_names
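
The method only touches the Reuters corpus, so the returned names can directly filter fileids for further processing. A hedged usage sketch, where helper stands in for an instance of the enclosing class (not shown in the snippet):

top_names = helper.get_most_populous_categs()
for cat in top_names:
    fileids = reuters.fileids(categories=[cat])
    print(cat, len(fileids), 'documents')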
Example #5
def train():
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)
        content = ' '.join(map(lambda x: ' '.join(x), para))
        texts.append(' '.join(normalize_tokens(content)))

    transformer = CountVectorizer()
    tf = transformer.fit_transform(texts)

    # Build the set of stemmed test words from the gold-standard similarity file,
    # then map each word to a row index.
    test_vocab = set()

    reader = csv.reader(open(global_truth_path))
    for line in reader:
        w1, w2, _ = line  # the gold similarity score is not needed here
        test_vocab.add(stemmer.stem(w1))
        test_vocab.add(stemmer.stem(w2))
    test_vocab = {k: v for v, k in enumerate(test_vocab)}

    # Co-occurrence matrix: one row per test word, one column per corpus term.
    model = np.zeros((len(test_vocab), len(transformer.vocabulary_)))

    # Count, for every test word, how often each vocabulary word appears
    # within +/- window_size tokens of it.
    for text in texts:
        text = text.split()
        for i in range(len(text)):
            if text[i] not in test_vocab:
                continue
            for j in range(i - window_size, i + window_size + 1):
                if j < 0 or j >= len(text) or j == i:
                    continue
                if text[j] not in transformer.vocabulary_:
                    continue
                model[test_vocab[text[i]]][transformer.vocabulary_[text[j]]] += 1
    model.dump(model_path)
    pickle.dump(transformer.vocabulary_, open(vocab_path, 'wb'))
    pickle.dump(test_vocab, open(test_vocab_path, 'wb'))
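
The dumped matrix has one row per stemmed test word (indexed by test_vocab) and one column per corpus term (indexed by transformer.vocabulary_). A minimal sketch of scoring a word pair by comparing their co-occurrence rows, assuming the same paths and the same stemmer used above; the pair shown is illustrative:

import pickle
import numpy as np

model = np.load(model_path, allow_pickle=True)
test_vocab = pickle.load(open(test_vocab_path, 'rb'))

def pair_similarity(w1, w2):
    u = model[test_vocab[stemmer.stem(w1)]]
    v = model[test_vocab[stemmer.stem(w2)]]
    # small epsilon guards against all-zero rows for words never seen in any window
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-12)

print(pair_similarity('car', 'automobile'))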
Example #6
docs = reuters.fileids(categories=[category])
count = 0
docAmount = len(docs)

# Create separate folders for the category's training and test documents.
trainingPath = 'C:/MLprojekt/SSK/reuters/reuters/' + category + 'Training'
testPath = 'C:/MLprojekt/SSK/reuters/reuters/' + category + 'Test'
if not os.path.exists(trainingPath):
    os.makedirs(trainingPath)
if not os.path.exists(testPath):
    os.makedirs(testPath)

# Reuters fileids are prefixed 'test/' or 'training/'; use the prefix to decide
# which folder each document goes to.
for doc in docs:
    count += 1
    if doc.startswith('test/'):
        name = doc.replace('test/', '')
        document = reuters.paras(fileids=[doc])
        parseDoc(document[0], name, testPath)
    else:
        name = doc.replace('training/', '')
        document = reuters.paras(fileids=[doc])
        parseDoc(document[0], name, trainingPath)

    print(docAmount - count, ' documents left')
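
parseDoc is defined elsewhere in the project; judging from the comment below, it strips stopwords and writes the document under its original name into the target folder. A minimal sketch with that behaviour, assuming NLTK's English stopword list and a simple isalpha filter where the real helper appears to strip punctuation with str.maketrans:

import os
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def parseDoc(paragraph, name, path):
    # paragraph is one entry of reuters.paras(): a list of tokenised sentences
    words = [w.lower() for sent in paragraph for w in sent]
    kept = [w for w in words if w.isalpha() and w not in stop_words]
    with open(os.path.join(path, name), 'w') as f:
        f.write(' '.join(kept))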

# Parses a file, removes all stopwords, and saves it under the same name in the new folder
"""
def parse(file, filename, newPath):
    lines = file.read()
    words = lines.split()
    table = str.maketrans("", "", string.punctuation)