textmanip.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: set fileencoding=utf-8 :
import csv
import re

import nltk
import nltk.data
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize as wt

import numpy
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Imported but currently unused in this script:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering, AffinityPropagation
from sklearn.decomposition import IncrementalPCA
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
# Naive regexp tokenizer: split on punctuation and whitespace.
sp_pattern = re.compile(r"""[.!"\s?\-,']+""", re.M)
stupid_tokenizer = sp_pattern.split
french_stopwords = set(stopwords.words('french'))
# Drop French stopwords (and, for fr_stop, empty tokens) from a token list.
filt_out = lambda text: [token for token in text if token.lower() not in french_stopwords]
fr_stop = lambda token: len(token) and token.lower() not in french_stopwords
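# Quick illustrative check on made-up tokens (a hypothetical example, not
# project data): the stopwords "nous" and "un" should be filtered out.
print(filt_out(["nous", "recherchons", "un", "jeune"]))   # -> ['recherchons', 'jeune']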
# Three short French job-ad snippets used as toy data. Roughly:
# data:  "We are looking, on behalf of a fast-growing company, for a passionate
#         young person, full of energy, to work in an intellectually stimulating field."
# data2: "On behalf of a fast-growing company, we are looking for a young person
#         willing to be exploited."
# data3: "We have a real trade, and we need to produce."
data = u"""Nous recherchons -pour les besoins d'une société en plein essor- un petit jeune passionné,
plein d'entrain, pour travailler dans un domaine intellectuellement stimulant."""
data2 = u"""pour le compte d'une société en plein essor, nous recherchons un jeune qui veut se faire exploiter"""
data3 = u"""Nous avons un vrai métier, et on a besoin de produire
"""
## We are brave: we execute arbitrary code from an unchecked origin \o/
## word_tokenize does an nltk.data.load('tokenizers/punkt/french.pickle')
## behind your back, i.e. it unpickles code downloaded from elsewhere.
## I do not advise doing so!
print("//".join(filt_out(wt(data, language="french"))))
### Let's see if a regexp does better.
print("//".join(filt_out(stupid_tokenizer(data))))
stemmer = SnowballStemmer("french", ignore_stopwords=True)   # unused below
stemmer2 = SnowballStemmer("french", ignore_stopwords=False)
# Tokenize, drop stopwords, stem, and join the stems back into a string.
print("//".join(
    map(stemmer2.stem,
        filter(fr_stop, stupid_tokenizer(data)))
))
text = " ".join(
    map(stemmer2.stem,
        filter(fr_stop, stupid_tokenizer(data)))
)
corpus = []
corpusOriginal = []
with open('eggs.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        data = row[2]
        # Strip the most common French diacritics by hand, then tokenize,
        # stopword-filter, and stem each document.
        text = " ".join(
            map(stemmer2.stem,
                filter(fr_stop,
                       stupid_tokenizer(
                           data.replace("é", "e").replace("è", "e").replace("â", "a")
                               .replace("ê", "e").replace("ù", "u").replace("û", "u")
                               .replace("ë", "e").replace("ü", "u").replace("à", "a")
                               .replace(",", " "))))
        )
        corpusOriginal.append(data)
        corpus.append(text)
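# A more general accent stripper than the replace() chain above would use
# Unicode decomposition; this is a sketch for reference, not wired in below.
import unicodedata
def strip_accents(s):
    # NFD splits accented characters into base char + combining mark ("Mn"),
    # and we drop the combining marks.
    return "".join(c for c in unicodedata.normalize("NFD", s)
                   if unicodedata.category(c) != "Mn")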
# Hold out most of the corpus; only the 20% kept in *_train is clustered.
# The same random_state keeps corpus and corpusOriginal aligned.
corpus_train, corpus_test = train_test_split(corpus, test_size=0.80, random_state=42)
corpusOriginal_train, corpusOriginal_test = train_test_split(corpusOriginal, test_size=0.80, random_state=42)
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=2200)
X = vectorizer.fit_transform(corpus_train)
pca = PCA(n_components=100)
X_pca = pca.fit_transform(X.toarray())
svd = TruncatedSVD(n_components=50, n_iter=50, random_state=42)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
Xsvd = lsa.fit_transform(X)
print(vectorizer.get_feature_names_out())   # get_feature_names() was removed in scikit-learn 1.2
print(X.shape)
print(X)
#X_train, X_test = train_test_split(X, test_size=0.90, random_state=42)
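# Quick diagnostic (not in the original pipeline): how much variance the
# 50 LSA components retain; TruncatedSVD exposes this after fitting.
print("LSA explained variance: {:.1%}".format(svd.explained_variance_ratio_.sum()))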
clustering = KMeans(n_clusters=3, random_state=0).fit(Xsvd)
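# A quick sanity check on the clustering (a sketch, not part of the original
# script): silhouette scores near 0 suggest heavily overlapping clusters.
from sklearn.metrics import silhouette_score
print("silhouette:", silhouette_score(Xsvd, clustering.labels_))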
print(clustering.labels_)
print(X)
numpy.savetxt("labels.csv", clustering.labels_, delimiter=",")
with open("titres.csv", "w") as f:
    f.write(str(vectorizer.get_feature_names_out()))
numpy.savetxt("train.csv", X.toarray(), delimiter=",")
with open('full.csv', 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in corpusOriginal_train:
        spamwriter.writerow([row.replace("\n", "").replace(",", " ")])
# Project the tf-idf matrix onto its first two principal components and
# colour each document by its cluster label.
pca = PCA(n_components=100)
X_pca = pca.fit_transform(X.toarray())
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clustering.labels_,
            cmap=plt.cm.nipy_spectral)
plt.show()
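# To label the clusters with words, one option (a sketch, not in the original
# script) is to map the KMeans centroids from LSA space back to term space
# and list the highest-weighted terms per cluster:
terms = vectorizer.get_feature_names_out()
original_space_centroids = svd.inverse_transform(clustering.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
for i, centroid in enumerate(order_centroids):
    # Ten most characteristic (stemmed) terms for cluster i.
    print("cluster %d: %s" % (i, " ".join(terms[j] for j in centroid[:10])))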