class ToySentimentClassifier(object):
    """Toy sentiment classifier: word/lemma/char n-gram and embedding
    features, evaluated with a linear SVC under 10-fold cross-validation.

    NOTE(review): a second ``ToySentimentClassifier`` definition appears
    later in this file and shadows this one at import time — confirm which
    version is intended to be live.
    """

    def __init__(self):
        self.feature_extractor = FeatureExtractor()

    def extract_features(self, doc):
        """Return a single dict with every feature extracted from *doc*."""
        all_features = {}
        # Word and lemma n-grams for n = 1..3.
        for n in range(1, 4):
            all_features.update(
                self.feature_extractor.extract_word_ngrams(doc, n))
        for n in range(1, 4):
            all_features.update(
                self.feature_extractor.extract_lemma_ngrams(doc, n))
        # Character n-grams for n = 3..5.
        for n in range(3, 6):
            all_features.update(self.feature_extractor.compute_n_chars(doc, n))
        all_features.update(
            self.feature_extractor.compute_document_length(doc))
        # NOTE(review): ``embeddings`` is a module-level global defined
        # elsewhere in this file — confirm it is initialized before use.
        we_feats = self.feature_extractor.compute_embeddings(doc, embeddings)
        if we_feats:
            all_features.update(we_feats)
        return all_features

    def train(self, model_name, input_file_name):
        """Cross-validate an SVC on the documents read from *input_file_name*.

        ``model_name`` is accepted for interface compatibility but is unused
        here: this version only reports cross-validation scores and does not
        persist a model.
        """
        reader = InputReader(input_file_name)
        all_docs = []
        for doc in reader.generate_documents():
            doc.features = self.extract_features(doc)
            all_docs.append(doc)
        print(len(all_docs))

        # Encoding of samples (X): turn the per-document feature dicts into
        # one sparse matrix — one row per document, one column per feature id.
        all_collected_feats = [doc.features for doc in all_docs]
        X_dict_vectorizer = DictVectorizer(sparse=True)
        encoded_features = X_dict_vectorizer.fit_transform(all_collected_feats)

        # Scale so every feature contributes comparably and training is
        # faster; with_mean=False keeps the matrix sparse (centering a
        # sparse matrix would densify it).
        scaler = preprocessing.StandardScaler(with_mean=False).fit(
            encoded_features)
        encoded_scaled_features = scaler.transform(encoded_features)

        # Encoding of labels (y): map class names to integer ids.
        labels = [doc.label for doc in all_docs]
        label_encoder = preprocessing.LabelEncoder()
        label_encoder.fit(labels)
        encoded_labels = label_encoder.transform(labels)

        # Classifier + 10-fold cross-validation.
        # NOTE(review): the 'precision'/'recall'/'f1' scorers assume binary
        # labels; for multi-class data use the *_macro / *_weighted variants.
        scoring = ['accuracy', 'precision', 'recall', 'f1']
        clf = SVC(kernel='linear', C=1e3)
        cross_val_scores = cross_validate(clf,
                                          encoded_scaled_features,
                                          encoded_labels,
                                          cv=10,
                                          scoring=scoring)
        print('accuracy\tprecision\trecall\tf1\n')
        print('\t'.join(
            str(np.average(cross_val_scores['test_%s' % metric]))
            for metric in ('accuracy', 'precision', 'recall', 'f1')) + '\n')
class ToySentimentClassifier(object):
    """Sentiment classifier pipeline: n-gram features -> LinearSVC.

    Supports training (with model persistence into a tar archive of
    pickled components), loading a saved model, tagging a new corpus
    (``parse``) and SENTIPOLC-style CSV evaluation.
    """

    def __init__(self):
        self.feature_extractor = FeatureExtractor()

    def extract_features(self, doc):
        """Return a single dict with every feature extracted from *doc*."""
        all_features = {}
        # Word, lemma and character n-grams, each for n = 1..2.
        for n in range(1, 3):
            all_features.update(
                self.feature_extractor.extract_word_ngrams(doc, n))
        for n in range(1, 3):
            all_features.update(
                self.feature_extractor.extract_lemma_ngrams(doc, n))
        for n in range(1, 3):
            all_features.update(self.feature_extractor.compute_n_chars(doc, n))
        all_features.update(
            self.feature_extractor.compute_document_length(doc))
        return all_features

    def train(self, model_name, input_file_name):
        """Train a LinearSVC on *input_file_name* and save it as *model_name*.

        Reports the cross-validated weighted F1, then refits on the full
        data and bundles classifier, scaler, label encoder and vectorizer
        into a single tar archive named *model_name*.
        """
        reader = InputReader(input_file_name)
        all_docs = []
        for doc in reader.generate_documents():
            doc.features = self.extract_features(doc)
            all_docs.append(doc)

        # Encoding of samples (X): per-document feature dicts -> one sparse
        # matrix (one row per document, one column per feature id).
        all_collected_feats = [doc.features for doc in all_docs]
        X_dict_vectorizer = DictVectorizer(sparse=True)
        encoded_features = X_dict_vectorizer.fit_transform(all_collected_feats)

        # Scale to increase performance and reduce training time;
        # with_mean=False keeps the matrix sparse.
        scaler = preprocessing.StandardScaler(
            with_mean=False).fit(encoded_features)
        encoded_scaled_features = scaler.transform(encoded_features)

        # Encoding of labels (y): map class names to integer ids.
        labels = [doc.label for doc in all_docs]
        label_encoder = preprocessing.LabelEncoder()
        label_encoder.fit(labels)
        encoded_labels = label_encoder.transform(labels)

        # Classifier algorithm + cross-validation report.
        clf = LinearSVC()
        cross_val_scores = cross_val_score(clf,
                                           encoded_scaled_features,
                                           encoded_labels,
                                           scoring='f1_weighted')
        print("Average F1 Weighted: %s" %
              (sum(cross_val_scores) / len(cross_val_scores), ))

        # Refit on the full training set before persisting.
        clf.fit(encoded_scaled_features, encoded_labels)

        # Save each component, then bundle them into one tar archive.
        # Passing filenames to joblib.dump (never open file objects)
        # avoids leaking unclosed handles.
        components = [
            (clf, 'clf.pkl'),
            (scaler, 'scaler.pkl'),
            (label_encoder, 'label_encoder.pkl'),
            (X_dict_vectorizer, 'vectorizer.pkl'),
        ]
        for obj, fname in components:
            joblib.dump(obj, fname)
        with tarfile.open("%s" % model_name, "w") as tar:
            for _, fname in components:
                tar.add(fname)
                os.remove(fname)  # the archive now owns the data

    def evaluate_sentipolc(self, docs):
        """Write predicted/gold CSVs in SENTIPOLC format and run `evaluate`.

        Each row carries the document id plus (opos, oneg) polarity flags
        derived from the class label.
        """

        def clz_to_opos_oneg(clz):
            # Map a class label onto (opos, oneg) polarity flags.
            mapping = {
                "POS": (1, 0),
                "NEG": (0, 1),
                "O": (0, 0),
                "POS_NEG": (1, 1),
            }
            try:
                return mapping[clz]
            except KeyError:
                # Original code died with UnboundLocalError here; raise a
                # clear error for unexpected labels instead.
                raise ValueError("unknown class label: %r" % (clz, ))

        field_names = [
            "id", "sub", "opos", "oneg", "iro", "lpos", "lneg", "top"
        ]

        def write_polarity_csv(path, label_of):
            # newline='' is required when handing a file to the csv module
            # (prevents blank rows on Windows). No writeheader(): the
            # evaluation script expects headerless rows, matching the
            # original behaviour.
            with open(path, 'w', newline='') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=field_names)
                for doc in docs:
                    opos, oneg = clz_to_opos_oneg(label_of(doc))
                    writer.writerow({
                        'id': doc.id,
                        'opos': opos,
                        'oneg': oneg
                    })

        write_polarity_csv("predicted.csv",
                           lambda doc: doc.labeled_prediction)
        # Generate gold file
        write_polarity_csv("gold.csv", lambda doc: doc.label)
        # Evaluation
        evaluate("gold.csv", "predicted.csv")

    def load_model(self, model_name):
        """Restore classifier, scaler, label encoder and vectorizer from the
        tar archive *model_name* (as written by :meth:`train`)."""
        attr_for_member = {
            "clf.pkl": "classifier",
            "scaler.pkl": "scaler",
            "label_encoder.pkl": "label_encoder",
            "vectorizer.pkl": "vectorizer",
        }
        with tarfile.open("%s" % model_name, 'r') as tar:
            for tarinfo in tar:
                attr = attr_for_member.get(tarinfo.name)
                if attr is not None:
                    # Only extract members we actually recognise.
                    setattr(self, attr, joblib.load(tar.extractfile(tarinfo)))

    def parse(self, input_file_name):
        """Predict a label for every document in *input_file_name*, print a
        classification report and run the SENTIPOLC evaluation.

        Requires :meth:`load_model` to have been called first (uses
        self.vectorizer / self.scaler / self.classifier / self.label_encoder).
        """
        reader = InputReader(input_file_name)
        all_docs = []
        original_labels = []
        predicted_labels = []
        for doc in reader.generate_documents():
            doc.features = self.extract_features(doc)
            all_docs.append(doc)
            # Encode this document with the fitted vectorizer/scaler and
            # map the integer prediction back to its label string.
            encoded_features = self.vectorizer.transform(doc.features)
            encoded_scaled_features = self.scaler.transform(encoded_features)
            predictions = self.classifier.predict(encoded_scaled_features)
            labeled_prediction = self.label_encoder.inverse_transform(
                predictions)[0]
            original_labels.append(doc.label)
            predicted_labels.append(labeled_prediction)
            doc.labeled_prediction = labeled_prediction
        print(
            sklearn.metrics.classification_report(original_labels,
                                                  predicted_labels))
        self.evaluate_sentipolc(all_docs)