class CountAdjectives(TransformerMixin):
    """Scikit-learn transformer mapping each text to ``[adjective fraction]``.

    POS tags come from an averaged-perceptron tagger trained on the Dutch
    Alpino corpus, whose tag set is lowercase ('adj', 'noun', 'verb', ...).
    """

    def __init__(self):
        super(CountAdjectives, self).__init__()
        # Fix: load=False trains from scratch on Dutch data. The original
        # load=True first loaded NLTK's pretrained ENGLISH model and then
        # continued training on Alpino, mixing English weights into a Dutch
        # tagger and paying the pickle-load cost for nothing.
        self.tagger = PerceptronTagger(load=False)
        training_corpus = list(alpino.tagged_sents())
        self.tagger.train(training_corpus)

    def postag(self, x):
        """Return the POS tags (strings) for the whitespace-split tokens of ``x``."""
        postagged = self.tagger.tag(x.split())
        return [tt[1] for tt in postagged]

    def count_adjectives(self, x):
        """Return the fraction of tokens in ``x`` tagged 'adj'.

        Returns 0 for empty input or text containing no adjectives.
        """
        postagged = self.postag(x)
        totalcount = len(postagged)
        adjlength = postagged.count('adj')
        if adjlength > 0:
            return adjlength / totalcount
        return 0

    def transform(self, X, y=None):
        """Map each document in ``X`` to a single-feature row ``[adj fraction]``."""
        return [[self.count_adjectives(x)] for x in X]

    def fit(self, X, y=None):
        """No-op: all state is built in ``__init__``."""
        return self
def train_corpus_to_tag():
    """Train a fresh averaged-perceptron POS tagger on the Alpino corpus.

    :return: the trained ``PerceptronTagger`` instance
    """
    pos_tagger = PerceptronTagger(load=False)
    pos_tagger.train(list(alp.tagged_sents()))
    return pos_tagger
def test_perceptron_tagger(self):
    """Round-trip a trained PerceptronTagger through the codec and compare state."""
    original = PerceptronTagger(load=False)
    original.train(self.corpus)
    restored = self.decoder.decode(self.encoder.encode(original))
    # The decoded tagger must carry identical learned state.
    self.assertEqual(original.model.weights, restored.model.weights)
    self.assertEqual(original.tagdict, restored.tagdict)
    self.assertEqual(original.classes, restored.classes)
def tagger(self):
    """POS-tag ``self.string`` with a perceptron tagger trained on Alpino.

    Usage::

        training_corpus = list(alp.tagged_sents())
        tagger = PerceptronTagger(load=False)
        tagger.train(training_corpus)
        print(tagger.tag(article_text.split()))

    :return: list of (token, tag) pairs for the whitespace-split input
    """
    # Load Corpus
    training_corpus = list(alp.tagged_sents())
    # Fix: load=False trains from scratch on Dutch data; the original
    # load=True first loaded NLTK's pretrained English model, mixing
    # English weights into the Dutch tagger.
    tagger = PerceptronTagger(load=False)
    # Build tagger
    tagger.train(training_corpus)
    return tagger.tag(self.string.split())
def main():
    """Cluster Dutch GxG news documents with KMeans over spaCy doc vectors,
    plot a 2-D PCA projection, and print label/gender distributions.

    Side effects: trains a module-global POS tagger, reads three GxG data
    files, and shows a matplotlib figure.
    """
    # NOTE(review): PerceptronTagger() defaults to load=True, i.e. the
    # pretrained English model is loaded and then re-trained on Dutch
    # Alpino data — confirm this mixing is intended.
    training_corpus = list(alp.tagged_sents())
    global tagger
    tagger = PerceptronTagger()
    tagger.train(training_corpus)
    num = 2138  # number of training documents to cluster
    dic = {}    # maps doc id -> gender character parsed from the <doc ...> header
    Xtrain = []
    Ytrain = []
    # Parse the pseudo-XML training file: '<doc id="..."' opens a document,
    # '</doc>' closes it; everything between is body text.
    # NOTE(review): `string` is only initialised inside the '<doc id=' branch,
    # so a file not starting with a header would raise NameError — confirm input format.
    with open("trainGxG/GxG_News.txt") as txt:
        for line in txt:
            if line[0:8] == "<doc id=":
                # line.split()[3][8] plucks the gender char out of the 4th header field
                Ytrain.append(line.split()[3][8])
                string=[line.split('\"')[1]]  # keep the doc id as the first "token"
                dic[line.split('\"')[1]] = line.split()[3][8]
            elif line[0:6] == "</doc>":
                Xtrain.append(" ".join(string))
            else:
                string.append(line)
    Xtest = []
    with open("testGxG/GxG_News.txt") as txt:
        for line in txt:
            if line[0:8] == "<doc id=":
                string=[]
            elif "</doc>" in line:
                Xtest.append(" ".join(string))
            else:
                string.append(line)
    # Gold genders for the test split.
    # NOTE(review): Ytest (and Xtest/Ytrain) are built but never used below.
    Ytest = []
    with open("testGxG/GxG_News_gold.txt") as text:
        for line in text:
            Ytest.append(line.split()[1])
    sentences = []
    for i in Xtrain[:num]:
        sentences.append(preprocess(i))
    # Embed each preprocessed document with the small Dutch spaCy model.
    nlp = spacy.load('nl_core_news_sm')
    veclist = []
    for sentence in sentences:
        doc = nlp(sentence)
        vec = doc.vector  # mean of token vectors (spaCy doc.vector)
        veclist.append(vec)
    X = np.array(veclist)
    # Two clusters, hoping they align with the two genders.
    # NOTE(review): precompute_distances and n_jobs were removed from
    # sklearn's KMeans in 1.0 — this pins an older sklearn.
    clf = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300,
                 tol=0.0001, precompute_distances='auto', verbose=0,
                 random_state=None, copy_x=True, n_jobs=None)
    labels = clf.fit_predict(X)
    # Project to 2-D for plotting only; clustering happened in full dimension.
    pca = PCA(n_components=2).fit(X)
    coords = pca.transform(X)
    lst = []
    for index, sentence in enumerate(sentences):
        # Annotate each point with "<gold gender><cluster label>:<snippet>".
        # Assumes preprocess() keeps the doc id as the first token — TODO confirm.
        plt.text(coords[index].tolist()[0],coords[index].tolist()[1],
                 str(dic[sentence.split()[0]]) + str(labels[index]) + ":" + str(sentence)[0:10],
                 fontsize=4)
        lst.append(str(dic[sentence.split()[0]]) + str(labels[index]))
    label_colors=["red", "blue", "green", "yellow", "black", "purple", "cyan"]
    colors = [label_colors[i] for i in labels]
    plt.scatter(coords[:, 0], coords[:, 1], c=colors)
    # Overlay the cluster centroids, projected through the same PCA.
    centroids = clf.cluster_centers_
    centroid_coords = pca.transform(centroids)
    plt.scatter(centroid_coords[:, 0], centroid_coords[:, 1], marker="X",
                s=200, linewidth=2, c="#444d61")
    print(Counter(labels))
    # Distribution of gold genders over the clustered subset.
    genders = []
    for i,j in enumerate(sentences):
        if i < num:
            genders.append(dic[j.split()[0]])
    print(Counter(genders))
    # Joint distribution "<gender><cluster>" — rough purity check.
    print(Counter(lst))
    plt.show()
def main(file_input):
    """End-to-end training pipeline for a candidate-filter neural network.

    Reads ``<file_input>.csv``, engineers pattern/context/location features,
    builds a bag-of-words vocabulary (pickled to ``vocab.txt``), trains a
    Keras MLP with 5-fold stratified CV, saves the final model to
    ``model_candidate_filter.h5`` and per-sample predictions to a CSV.

    :param file_input: CSV path without the ``.csv`` extension
    :return: (mean CV accuracy, Brier score on the training data)
    """
    data_df = pd.read_csv(str(file_input) + '.csv')
    data_df = shuffle(data_df)
    print("Loaded .csv file Successfully")
    print("Total Number of Samples:", data_df.shape[0])
    print("Total Number of Features:", data_df.shape[1])

    # Missing Values
    # column with maximum missing values
    def missing_value(data_df):
        # Repeatedly find the column with the most NaNs and drop its NaN rows
        # until the frame is NaN-free.
        # NOTE(review): Series.argmax() returns a positional index in modern
        # pandas; .idxmax() is what yields the column *name* used below — confirm
        # the pinned pandas version.
        while data_df.isnull().sum().values.sum() != 0:
            col_with_missing_val = (data_df.isnull().sum()).argmax()
            data_df = data_df[data_df[col_with_missing_val].notnull()]  # drop corresponding rows that has NaN values
            print("Missing Values in Features:", col_with_missing_val)
        return data_df

    # Missing Value Treatment:
    print("Missing Value Treatment : Start")
    data_df = missing_value(data_df)
    print("Missing Value Treatment : Stop")
    print("Total Number of Samples:", data_df.shape[0])
    print("Total Number of Features:", data_df.shape[1])

    # pattern matcher for candidate feature
    # newly Added Features : Dates format, currency format, number of digits per candidate, number of separators
    # per candidate
    print("Computing Pattern Transformers: Start")
    pattern_strictlyDigits = "^[0-9]*$"
    pattern_endWithCharacters = "^\d*[\/.,@$!)(]$"  # Only digits + end with special characters
    pattern_telephone = "^0[0-9]{12}$"
    pattern_vat = "^0?[0-9]{9}$"
    pattern_date = '^[0-3]?[0-9](\/|\,|\.|\-){1}[0-9]?[0-9](\/|\,|\.|\-){1}[0-2][0-9]{1,3}$'
    pattern_currency_1 = '^[0-9]\.[0-9]+\,[0-9]*$'  # captures ddddd,dddd
    pattern_currency_2 = '^[0-9]+\,[0-9]+$'
    # Binary indicator columns from regex matches on the candidate string.
    data_df['currency_filter'] = data_df['candidate'].str.contains(pattern_currency_1, regex=True).astype(np.int64)\
        | data_df['candidate'].str.contains(pattern_currency_2, regex=True).astype(np.int64)
    data_df['dates_filter'] = data_df['candidate'].str.contains(
        pattern_date, regex=True).astype(np.int64)
    data_df["Is_strictly_Digits"] = data_df["candidate"].str.contains(
        pattern_strictlyDigits, regex=True).astype(np.int64)
    data_df["endWithCharacters"] = data_df["candidate"].str.contains(
        pattern_endWithCharacters, regex=True).astype(np.int64)
    # \W strips non-word chars, so this counts word characters (digits+letters+_).
    data_df["Number_of_Digits"] = data_df['candidate'].apply(
        lambda x: len(re.sub("\W", "", x)))
    data_df["Number_of_Separators"] = data_df['candidate'].apply(
        lambda x: len(re.sub("\w", "", x)))
    data_df["Length_of_Candidate"] = data_df['candidate'].apply(
        lambda x: len(x))
    # included the country code
    data_df["Telephone"] = data_df["candidate"].str.contains(
        pattern_telephone, regex=True).astype(np.int64)
    # VAT number contains 9 to 10 digits
    data_df["VATNumber"] = data_df["candidate"].str.contains(
        pattern_vat, regex=True).astype(np.int64)

    # drop blacklisted variables: remove rows whose candidate looks like a
    # date / currency / telephone / VAT / digits-plus-trailing-punctuation,
    # then drop the helper indicator column itself.
    dates_index = data_df.index[data_df['dates_filter'] == 1].tolist()
    data_df = data_df.drop(index=dates_index, axis=0)
    data_df = data_df.drop("dates_filter", axis=1)
    currency_index = data_df.index[data_df['currency_filter'] == 1].tolist()
    data_df = data_df.drop(index=currency_index, axis=0)
    data_df = data_df.drop(["currency_filter"], axis=1)
    telephone_index = data_df.index[data_df['Telephone'] == 1].tolist()
    data_df = data_df.drop(index=telephone_index, axis=0)
    data_df = data_df.drop(["Telephone"], axis=1)
    vat_index = data_df.index[data_df['VATNumber'] == 1].tolist()
    data_df = data_df.drop(index=vat_index, axis=0)
    data_df = data_df.drop(["VATNumber"], axis=1)
    vat_index = data_df.index[data_df['endWithCharacters'] == 1].tolist()
    data_df = data_df.drop(index=vat_index, axis=0)
    data_df = data_df.drop(["endWithCharacters"], axis=1)
    print("Computing Pattern Transformers: Stop")

    # NLP Techniques:
    # Tokenization, Stemming, lemmatization, Frequency Distribution, Bag of words approach
    # Combine three text columns to single column - This columns contains he full text
    data_df["Text"] = data_df["line_before"] + data_df["line_at"] + data_df[
        "line_after"]
    print("Computing Context Transformers: Start")

    # Context Transformers
    def email_match(doc):
        # 1 if the text contains anything email-shaped, else 0.
        match = re.search(r'[\w\.-]+@[\w\.-]+', str(doc))
        if match != None:
            return 1
        else:
            return 0

    data_df["Number_of_Characters_Text"] = data_df["Text"].apply(
        lambda x: len(re.sub("[^a-z]", "", str(x))))
    data_df["Number_of_Digits_Text"] = data_df["Text"].apply(
        lambda x: len(re.sub("[^0-9]+", "", str(x))))
    data_df["Number_of_Separators_Text"] = data_df["Text"].apply(lambda x: len(
        (re.sub("[\w]+", "", str(x))).replace(" ", "")))
    data_df["Is_Email_Exists"] = data_df["Text"].apply(
        email_match)  # place 1 everywhere email found else 0
    data_df["Number_of_spaces"] = data_df["Text"].apply(
        lambda x: str(x).count(' '))  # counts number of spaces,

    # Clean Data - Tokenization, Stop word check, Size filter, Stemming - Dutch Language
    ss = SnowballStemmer("dutch", "french")

    def clean_data(doc):
        ignore = list(set(stopwords.words(
            'dutch', 'french')))  # ignore the list of stopwords
        exl_chars = list(set(string.punctuation))
        exl_chars.append('€')
        # remove email ids to avoid conflicts in vocabulary construction
        doc = re.sub("[\w\.-]+@[\w\.-]+", " ", str(doc))
        doc = re.sub("\d", " ", str(doc))
        doc = ''.join([ch for ch in doc if ch not in exl_chars])
        words = []
        for i in word_tokenize(doc):  # tokenization
            if i not in ignore:
                if len(i) >= 2:  # standalone letters do not add any value
                    i = ss.stem(i)
                    words.append(i)
        # set() deduplicates tokens, so word order and counts are lost here.
        doc = ' '.join(list(set(words)))
        return doc

    print("Cleaning Text Data: Start")
    data_df["Text"] = data_df["Text"].apply(
        clean_data)  # tokenize, stem and lammetize
    print("Cleaning Text Data: Stop")
    print("Computing POS Vectors: Start")
    # training_corpus = alp.tagged_sents()
    # Train a Dutch POS tagger from scratch on Alpino (load=False: no English model).
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)

    def count_adj(doc):
        # Count adjective+adverb tags, skipping junk tokens with a tripled
        # character at the start or end (e.g. "iiiiiii").
        # NOTE(review): `tup[0][3:]` is "everything AFTER index 3", not the
        # last 3 chars, and `tags.remove(tup)` mutates the list while it is
        # being iterated — both look like latent bugs; confirm intent.
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][3:]
            if len(tags[0]) >= 3 and first_3_characters[
                    0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if len(tags[0]) >= 3 and last_3_characters[0] == last_3_characters[
                    1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_adj_adv = counts['adv'] + counts['adj']
        return count_adj_adv

    def count_nn(doc):
        # Same junk-token filtering as count_adj, then count noun tags.
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][3:]
            if len(tags[0]) >= 3 and first_3_characters[
                    0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if len(tags[0]) >= 3 and last_3_characters[0] == last_3_characters[
                    1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_nn = counts['noun']
        return count_nn

    def count_verb(doc):
        # Same junk-token filtering as count_adj, then count verb tags.
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][3:]
            if len(tags[0]) >= 3 and first_3_characters[
                    0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if len(tags[0]) >= 3 and last_3_characters[0] == last_3_characters[
                    1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_verb = counts['verb']
        return count_verb

    data_df["Adv_Adj_Count"] = data_df["Text"].apply(count_adj)
    data_df["NN_count"] = data_df["Text"].apply(count_nn)
    data_df["Verb_count"] = data_df["Text"].apply(count_verb)
    print("Computing POS Vectors: Stop")
    print("Computing Vocabulary: Start")

    # store all the words in positive class and negative in two separate lists
    docs_pos = []
    docs_pos.extend(
        word_tokenize(words) for words in data_df.Text[data_df.gold == 1])
    docs_pos = list(itertools.chain(*docs_pos))
    # Clean text data - remove words like --- iiiiiii, hhhhhccchhhh, abvwwwwwcgdccc
    # NOTE(review): removing from docs_pos while iterating it skips elements.
    for i in docs_pos:
        first_3_characters = i[:3]
        last_3_characters = i[-3:]
        if len(i) >= 3 and first_3_characters[0] == first_3_characters[
                1] == first_3_characters[2]:
            docs_pos.remove(i)
        if i in docs_pos and len(i) >= 3 and last_3_characters[
                0] == last_3_characters[1] == last_3_characters[2]:
            docs_pos.remove(i)
    print("Positve class words are stored successfully")
    all_words_pos = nltk.FreqDist(docs_pos)
    print("Computing vocabulary based on Positive Class")
    # find popular words, popular equals more than 25 times in the corpus
    popular_pos_words = []
    for i in all_words_pos.items():
        if i[1] >= 25:
            popular_pos_words.append(i[0])
    # Filter nouns from the popular positive class words
    tagged_pos_words = tagger.tag(popular_pos_words)
    filtered_tag_pos_words_nouns = []
    for word in tagged_pos_words:
        if word[1] == 'noun':
            filtered_tag_pos_words_nouns.append(word[0])
    vocab_pos = list(set(filtered_tag_pos_words_nouns))
    vocabulary = list(set(vocab_pos))
    # save vocabulary (read back by the inference pipeline)
    with open("vocab.txt", "wb") as fp:
        pickle.dump(vocabulary, fp)
    print("Computing Vocabulary: Stop")
    print("Length of Vocabulary: ", len(vocabulary))
    print("Computing Bag of Words Vectors: Start")

    def build_features(doc):
        # Term-frequency vector of `doc` over the fixed vocabulary.
        vector = np.zeros((1, len(vocabulary)), dtype=np.int64)
        for w in word_tokenize(doc):
            for idx, vocab in enumerate(vocabulary):
                if vocab == w:
                    vector[0][idx] += 1
        return vector

    bag_vectors = data_df["Text"].apply(build_features)
    feature_vectors = np.zeros((data_df.shape[0], len(vocabulary)),
                               dtype=np.int64)
    for pos, index in enumerate(data_df.index.values):
        feature_vectors[pos, :] = bag_vectors[index]
    # One BOW_i column per vocabulary entry.
    cols = ["BOW_" + str(col) for col in range(0, len(vocabulary))]
    for col_index, col in enumerate(cols):
        data_df[col] = feature_vectors[:, col_index].reshape(data_df.shape[0], 1)
    print("Computing Bag of Words Vectors: Stop")
    print("Computing Context Transformers: Stop")
    print("Computing Location Transformers: Start")
    # Clip page/line numbers: anything >= 50 is bucketed to 100.
    data_df["location_page_nr"] = data_df["page_nr"].apply(lambda x: 100
                                                           if x >= 50 else x)
    data_df["location_line_nr"] = data_df["line_nr"].apply(lambda x: 100
                                                           if x >= 50 else x)
    print("Computing Location Transformers: Stop")
    print("Total Number of Newly Added Features:", data_df.shape[1] - 7)
    print("Building ML - Neural Network Model: Start")
    # Drop raw text/label columns; everything remaining is a numeric feature.
    X = data_df.drop([
        "candidate", "Text", "gold", "label", "line_after", "line_at",
        "line_before", "line_nr", "page_nr"
    ], axis=1)
    y = data_df.gold
    # Normalisation (z-score per column)
    X = (X - X.mean(axis=0)) / X.std(axis=0)

    def build_model(input_shape):
        # 1024-512-128-1 MLP, sigmoid activations, MSE loss on a 0/1 target.
        model = Sequential()
        model.add(Dense(1024, input_shape=(input_shape, )))
        model.add(Activation('sigmoid'))
        model.add(Dense(512))
        model.add(Activation('sigmoid'))
        model.add(Dense(128))
        model.add(Activation('sigmoid'))
        model.add(Dense(1, activation="sigmoid"))
        model.compile(optimizer='adam',
                      loss=tf.keras.losses.mean_squared_error,
                      metrics=['accuracy'])
        return model

    # Stratified k-Fold cross-validation; collect per-fold accuracy.
    k_fold_outer = model_selection.StratifiedKFold(n_splits=5)
    scores = []
    split = 0
    for train_index, test_index in k_fold_outer.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        model = build_model(X_train.shape[1])
        history = model.fit(X_train,
                            y_train,
                            epochs=5,
                            batch_size=1024,
                            verbose=1)
        results = model.evaluate(X_val, y_val)
        scores.append(results[1])  # results = [loss, accuracy]
        split += 1
        del model, history, results  # free keras objects between folds
    # Final model trained on ALL data, then saved for the inference pipeline.
    model = build_model(X.shape[1])
    model.fit(X, y, verbose=0)
    print('Saving the Model *.h5...')
    model.save('model_candidate_filter.h5')
    # Predictions on the training data itself (for the results CSV / Brier score).
    yHat_proba = model.predict(X)
    yHat = np.copy(yHat_proba)
    yHat[yHat <= 0.5] = 0
    yHat[yHat > 0.5] = 1
    br_score = np.around(metrics.brier_score_loss(y, yHat_proba, pos_label=1),
                         decimals=5)
    print("Storing Results in .csv file")
    # Confidence = probability of the predicted class.
    confidence = np.zeros((yHat_proba.shape[0], yHat_proba.shape[1]))
    for i in range(0, yHat_proba.shape[0]):
        if yHat_proba[i] <= 0.5:
            confidence[i] = 1 - yHat_proba[i]
        else:
            confidence[i] = yHat_proba[i]
    results_data_frame = pd.DataFrame(
        columns=["Predictions", "Confidence Level"], index=data_df.index)
    results_data_frame["Predictions"] = yHat.astype(np.int64).ravel()
    results_data_frame["Confidence Level"] = np.around(confidence, decimals=4)
    results_data_frame.to_csv("Results_predictions_confidence_train.csv",
                              encoding='utf-8',
                              header=True,
                              index=True)
    return np.mean(scores), br_score
def main(file_input):
    """Inference pipeline for the candidate-filter model.

    Reads ``<file_input>.csv``, re-applies the same feature engineering as
    the training pipeline (patterns, context stats, cleaning, POS counts,
    bag-of-words using the pickled ``vocab.txt``), loads
    ``model_candidate_filter.h5`` and writes predictions plus confidence
    to ``Results_predictions_confidence_run.csv``.

    :param file_input: CSV path without the ``.csv`` extension
    """
    test_data = pd.read_csv(str(file_input) + '.csv')
    # test_data = pd.read_csv(str(file_input) + '.csv', index_col='Unnamed: 0')
    print("Loaded .csv file Successfully")
    print("Missing Value Treatment : Start")
    # missing values Treatment: repeatedly drop NaN rows of the column with
    # the most NaNs until none remain.
    # NOTE(review): Series.argmax() returns a positional index in modern
    # pandas; .idxmax() yields the column *name* used below — confirm pandas version.
    while test_data.isnull().sum().values.sum() != 0:
        col_with_missing_val = (test_data.isnull().sum()).argmax()
        test_data = test_data[test_data[col_with_missing_val].notnull()]  # drop corresponding rows that has NaN values
        print(col_with_missing_val)
    print("Missing Value Treatment : Stop")
    print("Total Number of Samples:", test_data.shape[0])
    print("Total Number of Features:", test_data.shape[1])
    print("Computing Pattern Transformers: Start")
    # pattern transformers
    pattern_strictlyDigits = "^[0-9]*$"
    test_data["strictly_Digits"] = test_data["candidate"].str.contains(
        pattern_strictlyDigits, regex=True).astype(np.int64)
    test_data["Number_of_Digits"] = test_data['candidate'].apply(
        lambda x: len(re.sub("\W", "", x)))
    test_data["Number_of_Seprators"] = test_data['candidate'].apply(
        lambda x: len(re.sub("\w", "", x)))
    test_data["Length_of_Candidate"] = test_data['candidate'].apply(
        lambda x: len(x))
    print("Computing Pattern Transformers: Stop")
    print("Computing Context Transformers: Start")
    # context transformers: concatenate the three surrounding-line columns.
    test_data["Text"] = test_data["line_before"] + test_data[
        "line_at"] + test_data["line_after"]

    def email_match(doc):
        # 1 if the text contains anything email-shaped, else 0.
        match = re.search(r'[\w\.-]+@[\w\.-]+', str(doc))
        if match != None:
            return 1
        else:
            return 0

    test_data["Number_of_Characters_Text"] = test_data["Text"].apply(
        lambda x: len(re.sub("[^a-z]", "", str(x))))
    test_data["Number_of_Digits_Text"] = test_data["Text"].apply(
        lambda x: len(re.sub("[^0-9]+", "", str(x))))
    test_data["Number_of_Separators_Text"] = test_data["Text"].apply(
        lambda x: len((re.sub("[\w]+", "", str(x))).replace(" ", "")))
    test_data["Email_Exists"] = test_data["Text"].apply(
        email_match)  # place 1 everywhere email found else 0
    test_data["Number_of_spaces"] = test_data["Text"].apply(
        lambda x: str(x).count(' '))  # counts number of spaces

    # Clean Data - Tokenization, Stop word check, Size filter, Stemming - Dutch Language
    ss = SnowballStemmer("dutch", "french")

    def clean_data(doc):
        ignore = list(set(stopwords.words(
            'dutch', 'french')))  # ignore the list of stopwords
        exl_chars = list(set(string.punctuation))
        exl_chars.append('€')
        doc = re.sub(
            "[\w\.-]+@[\w\.-]+", " ", str(doc)
        )  # remove email ids to avoid conflicts in vocabulary construction
        doc = re.sub("\d", " ", str(doc))
        doc = ''.join([ch for ch in doc if ch not in exl_chars])
        words = []
        for i in word_tokenize(doc):  # tokenization
            if i not in ignore:
                if len(i) >= 2:  # standalone letters do not add any value
                    i = ss.stem(i)
                    words.append(i)
        # set() deduplicates tokens, so word order and counts are lost here.
        doc = ' '.join(list(set(words)))
        return doc

    test_data["Text"] = test_data["Text"].apply(
        clean_data)  # tokenize, stem and lammetize
    # training_corpus = alp.tagged_sents()
    # Train a Dutch POS tagger from scratch on Alpino (load=False: no English model).
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)

    def count_adj(doc):
        # Count adjective+adverb tags, skipping junk tokens with a tripled
        # character at the start or end.
        # NOTE(review): `tup[0][3:]` is "everything AFTER index 3", not the
        # last 3 chars, and `tags.remove(tup)` mutates the list while it is
        # being iterated — both look like latent bugs; confirm intent.
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][3:]
            if len(tags[0]) >= 3 and first_3_characters[
                    0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if len(tags[0]) >= 3 and last_3_characters[0] == last_3_characters[
                    1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_adj_adv = counts['adv'] + counts['adj']
        return count_adj_adv

    def count_nn(doc):
        # Same junk-token filtering as count_adj, then count noun tags.
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][3:]
            if len(tags[0]) >= 3 and first_3_characters[
                    0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if len(tags[0]) >= 3 and last_3_characters[0] == last_3_characters[
                    1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_nn = counts['noun']
        return count_nn

    def count_verb(doc):
        # Same junk-token filtering as count_adj, then count verb tags.
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][3:]
            if len(tags[0]) >= 3 and first_3_characters[
                    0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if len(tags[0]) >= 3 and last_3_characters[0] == last_3_characters[
                    1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_verb = counts['verb']
        return count_verb

    test_data["Adv_Adj_Count"] = test_data["Text"].apply(count_adj)
    test_data["NN_count"] = test_data["Text"].apply(count_nn)
    test_data["Verb_count"] = test_data["Text"].apply(count_verb)
    print("Computing Context Transformers: Stop")
    # load the vocabulary pickled by the training pipeline
    with open("vocab.txt", "rb") as fp:
        vocabulary = pickle.load(fp)
    print("Computing Bag of Words Vectors: Start")

    def build_features(doc):
        # Term-frequency vector of `doc` over the fixed vocabulary.
        vector = np.zeros((1, len(vocabulary)), dtype=np.int64)
        for w in word_tokenize(doc):
            for i, word in enumerate(vocabulary):
                if word == w:
                    vector[0][i] += 1
        return vector

    bag_vectors = test_data["Text"].apply(build_features)
    feature_vectors = np.zeros((test_data.shape[0], len(vocabulary)),
                               dtype=np.int64)
    for pos, index in enumerate(test_data.index.values):
        feature_vectors[pos, :] = bag_vectors[index]
    # One BOW_i column per vocabulary entry (must match training layout).
    cols = ["BOW_" + str(col) for col in range(0, len(vocabulary))]
    for col_index, col in enumerate(cols):
        test_data[col] = feature_vectors[:, col_index].reshape(
            test_data.shape[0], 1)
    print("Computing Bag of Words Vectors: Stop")
    print("Computing Location Transformers: Start")
    # Clip page/line numbers: anything >= 50 is bucketed to 100.
    test_data["location_page_nr"] = test_data["page_nr"].apply(
        lambda x: 100 if x >= 50 else x)
    test_data["location_line_nr"] = test_data["line_nr"].apply(
        lambda x: 100 if x >= 50 else x)
    print("Computing Location Transformers: Stop")
    print("Loading Model...")
    model = tf.keras.models.load_model('model_candidate_filter.h5')
    model.compile(loss=tf.keras.losses.mean_squared_error,
                  optimizer='adam',
                  metrics=['accuracy'])
    print("Loaded Model Successfully!")
    # Drop raw text/label columns; remaining columns are the model features.
    X_test = test_data.drop([
        "candidate", "Text", "label", "line_after", "line_at", "line_before",
        "page_nr", "line_nr"
    ], axis=1)
    # NOTE(review): normalises with the TEST set's own mean/std rather than
    # statistics saved from training — confirm this matches intent.
    X_test = (X_test - X_test.mean(axis=0)) / X_test.std(axis=0)
    yHat_proba = model.predict(X_test)
    yHat = np.copy(yHat_proba)
    yHat[yHat <= 0.5] = 0
    yHat[yHat > 0.5] = 1
    print("Storing Results in .csv file")
    # Confidence = probability of the predicted class.
    confidence = np.zeros((yHat_proba.shape[0], yHat_proba.shape[1]))
    for i in range(0, yHat_proba.shape[0]):
        if yHat_proba[i] <= 0.5:
            confidence[i] = 1 - yHat_proba[i]
        else:
            confidence[i] = yHat_proba[i]
    results_data_frame = pd.DataFrame(
        columns=["Predictions", "Confidence Level"], index=test_data.index)
    results_data_frame["Predictions"] = yHat.astype(np.int64).ravel()
    results_data_frame["Confidence Level"] = np.around(confidence, decimals=4)
    results_data_frame.to_csv("Results_predictions_confidence_run.csv",
                              encoding='utf-8',
                              header=True,
                              index=True)
import numpy as np
import torch
from torch.autograd import Variable
import pickle
from collections import Counter
from torch import nn
import torch.nn.functional as F
from nltk.tag import PerceptronTagger
from nltk.corpus import alpino as alp
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import PunktSentenceTokenizer

# Module-level setup: POS tagger trained on the Dutch Alpino corpus.
# NOTE(review): load=True first loads NLTK's pretrained English model and then
# continues training on Dutch data; load=False would train from scratch —
# confirm the mixing is intended.
training_corpus = list(alp.tagged_sents())
tagger = PerceptronTagger(load=True)
tagger.train(training_corpus)

wordTokenizer = WordPunctTokenizer()
sentTokenizer = PunktSentenceTokenizer()


def generate_vocabulary(data, vocabulary_size):
    """Build the set of the ``vocabulary_size`` most frequent word tokens.

    :param data: iterable of text strings; joined and tokenized into
        sentences then words
    :param vocabulary_size: number of most-common words to keep
    :return: set of the most frequent words (fix: the original computed
        this set but never returned it, silently discarding the result)
    """
    all_data = " ".join(data)
    print(all_data[:100])  # debug peek at the joined corpus
    words = [
        word for sent in sentTokenizer.tokenize(all_data)
        for word in wordTokenizer.tokenize(sent)
    ]
    counter = Counter(words)
    # most_common() produces k frequently encountered
    # input values and their respective counts.
    most_common = counter.most_common(vocabulary_size)
    vocabulary = set([word for word, count in most_common])
    return vocabulary  # bug fix: previously missing
from nltk import word_tokenize
from nltk.tag import PerceptronTagger
from nltk.corpus import conll2000 as cn
import pickle
import time

# Train an averaged-perceptron POS tagger on CoNLL-2000 and pickle it to disk.
train = cn.tagged_sents("train.txt")
test = cn.tagged_sents("test.txt")

pt = PerceptronTagger(load=False)  # train from scratch, no pretrained model
sts = int(time.time())
pt.train(list(train), nr_iter=10)
fts = int(time.time())
pts = fts - sts  # training wall time in whole seconds
print(pts)  # fix: Python 2 `print pts` -> print() function (valid on 2 and 3)

# fix: context manager guarantees the pickle file is closed even on error
with open('ptagger.pickle', 'wb') as f:
    pickle.dump(pt, f)
def run_test(my_corpus):
    # Python 2 script: 5-fold cross-validation of NLTK's PerceptronTagger on
    # either the treebank corpus or the news category of brown, printing
    # per-fold and average train/test accuracy and runtimes. Returns None;
    # silently does nothing for any other corpus.
    if my_corpus == treebank:
        print 'Corpus Info:'
        print ' Corpus: treebank'
        print ' Tagged Sents:', len(my_corpus.tagged_sents())
        print ' Tagged Words:', len(my_corpus.tagged_words())
        my_tagged_sents = my_corpus.tagged_sents()
        my_sents = my_corpus.sents()
    elif my_corpus == brown:
        print 'Corpus Info:'
        print ' Corpus: brown'
        print ' Tagged Sents:', len(my_corpus.tagged_sents())
        print ' Tagged Words:', len(my_corpus.tagged_words())
        print ' Tagged Sents (news):', len(
            my_corpus.tagged_sents(categories='news'))
        print ' Tagged Words (news):', len(
            my_corpus.tagged_words(categories='news'))
        # Only the news category is used for brown.
        my_tagged_sents = my_corpus.tagged_sents(categories='news')
        my_sents = my_corpus.sents(categories='news')
        #print ' Tagged Sents :', len(my_corpus.tagged_sents())
        #print ' Tagged Words :', len(my_corpus.tagged_words())
        #my_tagged_sents = my_corpus.tagged_sents()
        #my_sents = my_corpus.sents()
    else:
        return
    fold = 5
    print 'Performing', fold, 'fold cross validation on corpus ...'
    train_accuracy = []
    test_accuracy = []
    train_runtime = []
    test_runtime = []
    for k in range(fold):
        # Deterministic interleaved split: sentence i goes to the validation
        # fold when i % fold == k, otherwise to training.
        train_data = [
            x for i, x in enumerate(my_tagged_sents) if i % fold != k
        ]
        validation_data = [
            x for i, x in enumerate(my_tagged_sents) if i % fold == k
        ]
        #test_data = [x for i, x in enumerate(my_sents) if i % fold == k]
        print 'Fold', k, ' has', len(train_data), 'train sentences and', len(
            validation_data), 'test sentences'
        # Fresh tagger per fold, trained from scratch (load=False).
        perceptron_pos_tagger = PerceptronTagger(load=False)
        begin = time.time()
        perceptron_pos_tagger.train(train_data)
        end = time.time()
        train_acc = perceptron_pos_tagger.evaluate(train_data)
        train_accuracy.append(train_acc)
        train_runtime.append(end - begin)  # training time, not eval time
        print ' Train accuracy =', train_acc, ' runtime =', end - begin
        begin = time.time()
        test_acc = perceptron_pos_tagger.evaluate(validation_data)
        end = time.time()
        test_accuracy.append(test_acc)
        test_runtime.append(end - begin)  # evaluation time on held-out fold
        print ' Test accuracy =', test_acc, ' runtime =', end - begin
    # Summary table: one row per fold plus the averages.
    print 'Results:'
    print '%15s %15s %15s %15s %15s' % ('Fold', 'Train-Accuracy',
                                        'Train-Runtime', 'Test-Accuracy',
                                        'Test-Runtime')
    for k in range(fold):
        print '%15d %15.3f%% %15.5f %15.3f%% %15.5f' % (
            k, train_accuracy[k] * 100, train_runtime[k],
            test_accuracy[k] * 100, test_runtime[k])
    avg_train_acc = sum(train_accuracy) / len(train_accuracy)
    avg_train_runtime = sum(train_runtime) / len(train_runtime)
    avg_test_acc = sum(test_accuracy) / len(test_accuracy)
    avg_test_runtime = sum(test_runtime) / len(test_runtime)
    print '%15s %15.3f%% %15.5f %15.3f%% %15.5f' % (
        'Average', avg_train_acc * 100, avg_train_runtime, avg_test_acc * 100,
        avg_test_runtime)
    return
def perceptron_tagger(train_data):
    """Train and return an averaged-perceptron POS tagger on ``train_data``."""
    model = PerceptronTagger(load=False)
    model.train(train_data)
    return model