def common_description(self, s0, s1):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0)
    s1_tags = tagger.tag(s1)
    total_dist = 0
    for word, tag in s0_tags:
        if tag.startswith('N') or tag.startswith('V') or tag.startswith('J') or tag.startswith('R'):
            max_dist = 0
            for synset in wn.synsets(word, self.penn_to_wn(tag)):
                desc = word_tokenize(synset.definition())
                dist = len(list(set(s1) & set(desc)))
                if dist > max_dist:
                    max_dist = dist
            total_dist += max_dist
    for word, tag in s1_tags:
        if tag.startswith('N') or tag.startswith('V') or tag.startswith('J') or tag.startswith('R'):
            max_dist = 0
            for synset in wn.synsets(word, self.penn_to_wn(tag)):
                desc = word_tokenize(synset.definition())
                dist = len(list(set(s0) & set(desc)))
                if dist > max_dist:
                    max_dist = dist
            total_dist += max_dist
    return total_dist
def smaller_subtree_containing_the_drugs(sentence, target_drugs):
    tree_string = nlp.annotate(sentence, properties={
        'annotators': 'parse',
        'outputFormat': 'json'
    })
    tagger = PerceptronTagger()
    best_subtree = None
    size = 9999999
    target_drugs = [dr for drug in target_drugs for dr in drug.split(' ')]
    for s in tree_string['sentences']:
        tree_parsed = Tree.fromstring(s['parse'])
        for subtree in tree_parsed.subtrees():
            # print(subtree.pretty_print())
            leafs = subtree.leaves()
            current_size = len(leafs)
            if all_drugs_in_tree(target_drugs, leafs):
                if current_size < size:
                    best_subtree = subtree
                    size = current_size
                    # print(subtree.leaves())
    try:
        clean = clean_sentence(best_subtree.leaves())
    except AttributeError:
        # no subtree contained all the drugs (best_subtree is None); fall back to the whole sentence
        clean = clean_sentence(sentence.split())
    # print('clean', clean)
    tagged = tagger.tag(clean)
    # print('tag:', tagged)
    lemmatized = preprocessor_lemmatize(tagged)
    # print('lemmatized', lemmatized)
    new_sentence = ' '.join([l for l, t in lemmatized])
    return new_sentence
def count_common_propper_nouns(self, s0, s1):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0)
    s1_tags = tagger.tag(s1)
    NNP_s0 = [values[0] for values in s0_tags if values[1] == 'NNP']
    NNP_s1 = [values[0] for values in s1_tags if values[1] == 'NNP']
    return len(set(NNP_s0) & set(NNP_s1))
def CorpusListPhrase(self, matrix, stopwords):
    phrase_list = []
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    """
    tagger = PerceptronTagger()
    pos_tag = tagger.tag
    # Create phrase tree
    chunker = nltk.RegexpParser(grammar)
    for doc in matrix:
        phrase = self._flatten([
            word for word in self._getTerms(
                chunker.parse(pos_tag(re.findall(r'\w+', str(doc)))))
            if word not in stopwords
        ])
        phrase_list.append(",".join(phrase))
    return phrase_list
def extract_tokens(row, lemmatize=True, use_tag=True):
    tokenizer = WhitespaceTokenizer()
    if lemmatize:
        # reduce words to lemmas
        pattern = '[().*+,?!\'\";:]*'
        token_list = list()
        if use_tag:
            # use POS tags to obtain more accurate lemmas
            pos_tags = PerceptronTagger().tag(tokenizer.tokenize(row['text']))
            lemmatizer_input = map(
                lambda x: (x[0], nltk_to_wordnet.get(x[1][0])), pos_tags)
            lemmatizer = WordNetLemmatizer()
            for word, tag in lemmatizer_input:
                if word != 'urlLink' and 'http:' not in word:
                    word = word.lower()
                    if tag is None:
                        tok = lemmatizer.lemmatize(word)
                        tok = re.sub(pattern, '', tok)
                        if not tok.isdigit():
                            token_list.append(tok)
                    else:
                        tok = lemmatizer.lemmatize(word, tag)
                        tok = re.sub(pattern, '', tok)
                        if not tok.isdigit():
                            token_list.append(tok)
        else:
            # do not use a tagger if not specified and speed up computation
            lemmatizer_input = tokenizer.tokenize(row['text'])
            lemmatizer = WordNetLemmatizer()
            for word in lemmatizer_input:
                if word != 'urlLink' and 'http:' not in word:
                    tok = lemmatizer.lemmatize(word.lower())
                    tok = re.sub(pattern, '', tok)
                    if not tok.isdigit():
                        token_list.append(tok)
    else:
        # simply tokenize based on whitespaces
        token_list = tokenizer.tokenize(row['text'])
    return token_list
def train_corpus_to_tag():
    """
    Train tagger on Alpino Corpus
    :return: model tagger <type: 'model'>
    """
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)
    return tagger
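# A minimal usage sketch for the helper above (an assumption, not part of the original
# snippet): train the Alpino-based tagger and tag a made-up Dutch sentence. It assumes
# `alp` (nltk.corpus.alpino) is imported and the Alpino corpus has been downloaded.
dutch_tagger = train_corpus_to_tag()
print(dutch_tagger.tag('Dit is een voorbeeldzin'.split()))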
def __init__(self, job_title_col, url_col, description_col, label_col,
             word_col, encoded_job_title_col, indeed_file, words_file):
    '''
    Parameters
    ----------
    job_title_col: str. column name that contains the job titles of the job postings
    url_col: str. column name that contains the urls of the job postings
    description_col: str. column name that contains the job descriptions of the job postings
    label_col: str. column name that contains the job group in set
        {"Data Scientist", "Machine Learning Engineer", "Data Engineer", "Data Analyst", "None"}
    word_col: str. column name that contains the hard skills
    encoded_job_title_col: str. column name that contains the encoded job group
    df_indeed: pandas df. the dataframe with the scraped job postings
    df_words: pandas df. the dataframe with the hard skills
    '''
    # initialize attributes related to the dataset
    self.job_title_col = job_title_col
    self.url_col = url_col
    self.description_col = description_col
    self.label_col = label_col
    self.word_col = word_col
    self.encoded_job_title_col = encoded_job_title_col

    # load the scraped files
    self.df_indeed = self._load_data(indeed_file)
    self.df_words = self._load_data(words_file)

    # initialize attributes related to extracted features
    self.job_description = None
    self.word_list = None
    self.features_list_single = []
    self.features_list_phrase = []
    self.topk_single = None
    self.topk_phrase = None
    self.topk_full = None
    self.df_single = pd.DataFrame()
    self.df_phrase = pd.DataFrame()
    self.df = pd.DataFrame()
    self.df_tools = pd.DataFrame()
    self.top_tools_dict = {}

    # initialize attributes related to keyphrase extraction
    self.grammar = self._initialize_grammar()
    self.stop = self._initialize_stopwords()
    self.text = """ initialize """
    self.tagger = PerceptronTagger()
    self.pos_tag = self.tagger.tag
    self.chunker = nltk.RegexpParser(self.grammar)
    self.taggedToks = self.pos_tag(re.findall(r'\w+', self.text))
    self.tree = self.chunker.parse(self.taggedToks)

    # perform pre-processing, feature extraction and post-processing
    self._execute_pre_processing()
    self._execute_feature_extraction()
    self._execute_post_processing()
def test_perceptron_tagger(self):
    tagger = PerceptronTagger(load=False)
    tagger.train(self.corpus)
    encoded = self.encoder.encode(tagger)
    decoded = self.decoder.decode(encoded)
    self.assertEqual(tagger.model.weights, decoded.model.weights)
    self.assertEqual(tagger.tagdict, decoded.tagdict)
    self.assertEqual(tagger.classes, decoded.classes)
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
def status():
    from autogoal.contrib import ContribStatus

    try:
        from nltk.corpus import wordnet
        from nltk.corpus import sentiwordnet
        from nltk.corpus import stopwords
        from nltk.stem import RSLPStemmer
        st = RSLPStemmer()
        from nltk.tag import PerceptronTagger
        tagger = PerceptronTagger()
    except LookupError:
        return ContribStatus.RequiresDownload

    return ContribStatus.Ready
def train_tagger(language, model_type, feature, train_sents):
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
    elif model_type == 'crf':
        tagger = CRFTagger()
        tagger.train(train_sents, 'taggers/{0}/{1}/crf.pickle'.format(language, feature))
    elif model_type == 'perceptron':
        tagger = PerceptronTagger(load=False)
        tagger.train(train_sents)
    return tagger
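# A hedged usage sketch (not from the original code): train the perceptron variant of
# train_tagger on the NLTK treebank sample and tag a held-out sentence. The split size
# and the 'en'/'pos' arguments are illustrative assumptions; the perceptron branch
# ignores the language and feature parameters anyway.
from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
perceptron = train_tagger('en', 'perceptron', 'pos', train_sents)
print(perceptron.tag(treebank.sents()[3500]))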
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    # return pkgutil.get_data('scattertext',
    #                         'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    # tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    # tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
def __init__(self, language, stanford=False):
    if not language:
        raise ValueError("No language specified for POS tagging")
    else:
        self._language = language

    if self._language == "eng" and stanford:
        self.model = StanfordPOSTagger(r'english-bidirectional-distsim.tagger')
        self.tagger = self.model.tag
    elif self._language == "eng":
        try:
            # "new" nltk with slow default behaviour through high-level API
            from nltk.tag import PerceptronTagger
            self.model = PerceptronTagger()
            self.tagger = self.model.tag
        except ImportError:
            self.model = None
            self.tagger = nltk.pos_tag
    elif self._language == "afr":
        self.model = HunposTagger(join(_MODEL_DIR, "pos-tag-model.af"), encoding='utf-8')
        self.tagger = self.model.tag
    elif self._language == "nso":
        self.model = HunposTagger(join(_MODEL_DIR, "simple-pos-tag-model.nso"), encoding='utf-8')
        self.tagger = self.model.tag
    elif self._language == "zul":
        # self.model = MarmotTagger(encoding='utf-8')
        self.model = HunposTagger(join(_MODEL_DIR, "simple-pos-tag-model.zu"), encoding='utf-8')
        self.tagger = self.model.tag
    else:
        raise ValueError(
            'Language "%s" not supported for POS tagging.\nSupply a 3 letter code from ISO-639.'
            % self._language)
def __init__(self, df, review_col, truth_col, copy=True, analyzer=None,
             stop_words=stopwords.words('english'),
             pos_tag=PerceptronTagger().tag,
             parse=RegexpParser(grammar).parse,
             lemmatize=WordNetLemmatizer().lemmatize):
    # DataFrame stuffs
    self.df = df.copy() if copy else df
    self.review_col = review_col
    self.truth_col = truth_col

    # NLP stuffs
    self.analyzer = self.vader if analyzer is None else analyzer
    self.stop_words = stop_words
    self.pos_tag = pos_tag
    self.parse = parse
    self.lemmatize = lemmatize
def tagger(self):
    """
    Usage:
        training_corpus = list(alp.tagged_sents())
        tagger = PerceptronTagger(load=True)
        tagger.train(training_corpus)
        # sent = 'NLTK is een goeda taal voor het leren over NLP'.split()
        print(tagger.tag(article_text.split()))
    :return:
    """
    # Load corpus
    training_corpus = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=True)
    # Build tagger
    tagger.train(training_corpus)
    return tagger.tag(self.string.split())
def main():
    training_corpus = list(alp.tagged_sents())
    global tagger
    tagger = PerceptronTagger()
    tagger.train(training_corpus)
    num = 2138
    dic = {}
    Xtrain = []
    Ytrain = []
    with open("trainGxG/GxG_News.txt") as txt:
        for line in txt:
            if line[0:8] == "<doc id=":
                Ytrain.append(line.split()[3][8])
                string = [line.split('\"')[1]]
                dic[line.split('\"')[1]] = line.split()[3][8]
            elif line[0:6] == "</doc>":
                Xtrain.append(" ".join(string))
            else:
                string.append(line)
    Xtest = []
    with open("testGxG/GxG_News.txt") as txt:
        for line in txt:
            if line[0:8] == "<doc id=":
                string = []
            elif "</doc>" in line:
                Xtest.append(" ".join(string))
            else:
                string.append(line)
    Ytest = []
    with open("testGxG/GxG_News_gold.txt") as text:
        for line in text:
            Ytest.append(line.split()[1])
    sentences = []
    for i in Xtrain[:num]:
        sentences.append(preprocess(i))
    nlp = spacy.load('nl_core_news_sm')
    veclist = []
    for sentence in sentences:
        doc = nlp(sentence)
        vec = doc.vector
        veclist.append(vec)
    X = np.array(veclist)
    clf = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300,
                 tol=0.0001, precompute_distances='auto', verbose=0,
                 random_state=None, copy_x=True, n_jobs=None)
    labels = clf.fit_predict(X)
    pca = PCA(n_components=2).fit(X)
    coords = pca.transform(X)
    lst = []
    for index, sentence in enumerate(sentences):
        plt.text(coords[index].tolist()[0], coords[index].tolist()[1],
                 str(dic[sentence.split()[0]]) + str(labels[index]) + ":" + str(sentence)[0:10],
                 fontsize=4)
        lst.append(str(dic[sentence.split()[0]]) + str(labels[index]))
    label_colors = ["red", "blue", "green", "yellow", "black", "purple", "cyan"]
    colors = [label_colors[i] for i in labels]
    plt.scatter(coords[:, 0], coords[:, 1], c=colors)
    centroids = clf.cluster_centers_
    centroid_coords = pca.transform(centroids)
    plt.scatter(centroid_coords[:, 0], centroid_coords[:, 1], marker="X", s=200,
                linewidth=2, c="#444d61")
    print(Counter(labels))
    genders = []
    for i, j in enumerate(sentences):
        if i < num:
            genders.append(dic[j.split()[0]])
    print(Counter(genders))
    print(Counter(lst))
    plt.show()
def main(file_input):
    data_df = pd.read_csv(str(file_input) + '.csv')
    data_df = shuffle(data_df)
    print("Loaded .csv file Successfully")
    print("Total Number of Samples:", data_df.shape[0])
    print("Total Number of Features:", data_df.shape[1])

    # Missing Values
    # repeatedly drop rows that have NaNs in the column with the most missing values
    def missing_value(data_df):
        while data_df.isnull().sum().values.sum() != 0:
            # idxmax returns the label of the column with the most missing values
            col_with_missing_val = (data_df.isnull().sum()).idxmax()
            data_df = data_df[data_df[col_with_missing_val].notnull()]
            print("Missing Values in Features:", col_with_missing_val)
        return data_df

    # Missing Value Treatment:
    print("Missing Value Treatment : Start")
    data_df = missing_value(data_df)
    print("Missing Value Treatment : Stop")
    print("Total Number of Samples:", data_df.shape[0])
    print("Total Number of Features:", data_df.shape[1])

    # Pattern matchers for the candidate feature
    # Newly added features: date format, currency format, number of digits per candidate,
    # number of separators per candidate
    print("Computing Pattern Transformers: Start")
    pattern_strictlyDigits = "^[0-9]*$"
    pattern_endWithCharacters = "^\d*[\/.,@$!)(]$"  # only digits + ends with special characters
    pattern_telephone = "^0[0-9]{12}$"
    pattern_vat = "^0?[0-9]{9}$"
    pattern_date = '^[0-3]?[0-9](\/|\,|\.|\-){1}[0-9]?[0-9](\/|\,|\.|\-){1}[0-2][0-9]{1,3}$'
    pattern_currency_1 = '^[0-9]\.[0-9]+\,[0-9]*$'  # captures ddddd,dddd
    pattern_currency_2 = '^[0-9]+\,[0-9]+$'

    data_df['currency_filter'] = data_df['candidate'].str.contains(pattern_currency_1, regex=True).astype(np.int64) \
        | data_df['candidate'].str.contains(pattern_currency_2, regex=True).astype(np.int64)
    data_df['dates_filter'] = data_df['candidate'].str.contains(
        pattern_date, regex=True).astype(np.int64)
    data_df["Is_strictly_Digits"] = data_df["candidate"].str.contains(
        pattern_strictlyDigits, regex=True).astype(np.int64)
    data_df["endWithCharacters"] = data_df["candidate"].str.contains(
        pattern_endWithCharacters, regex=True).astype(np.int64)
    data_df["Number_of_Digits"] = data_df['candidate'].apply(
        lambda x: len(re.sub("\W", "", x)))
    data_df["Number_of_Separators"] = data_df['candidate'].apply(
        lambda x: len(re.sub("\w", "", x)))
    data_df["Length_of_Candidate"] = data_df['candidate'].apply(
        lambda x: len(x))
    # includes the country code
    data_df["Telephone"] = data_df["candidate"].str.contains(
        pattern_telephone, regex=True).astype(np.int64)
    # VAT number contains 9 to 10 digits
    data_df["VATNumber"] = data_df["candidate"].str.contains(
        pattern_vat, regex=True).astype(np.int64)

    # drop blacklisted variables
    dates_index = data_df.index[data_df['dates_filter'] == 1].tolist()
    data_df = data_df.drop(index=dates_index, axis=0)
    data_df = data_df.drop("dates_filter", axis=1)
    currency_index = data_df.index[data_df['currency_filter'] == 1].tolist()
    data_df = data_df.drop(index=currency_index, axis=0)
    data_df = data_df.drop(["currency_filter"], axis=1)
    telephone_index = data_df.index[data_df['Telephone'] == 1].tolist()
    data_df = data_df.drop(index=telephone_index, axis=0)
    data_df = data_df.drop(["Telephone"], axis=1)
    vat_index = data_df.index[data_df['VATNumber'] == 1].tolist()
    data_df = data_df.drop(index=vat_index, axis=0)
    data_df = data_df.drop(["VATNumber"], axis=1)
    vat_index = data_df.index[data_df['endWithCharacters'] == 1].tolist()
    data_df = data_df.drop(index=vat_index, axis=0)
    data_df = data_df.drop(["endWithCharacters"], axis=1)
    print("Computing Pattern Transformers: Stop")

    # NLP techniques:
    # tokenization, stemming, lemmatization, frequency distribution, bag-of-words approach
    # Combine the three text columns into a single column - this column contains the full text
    data_df["Text"] = data_df["line_before"] + data_df["line_at"] + data_df["line_after"]

    print("Computing Context Transformers: Start")

    # Context Transformers
    def email_match(doc):
        match = re.search(r'[\w\.-]+@[\w\.-]+', str(doc))
        if match != None:
            return 1
        else:
            return 0

    data_df["Number_of_Characters_Text"] = data_df["Text"].apply(
        lambda x: len(re.sub("[^a-z]", "", str(x))))
    data_df["Number_of_Digits_Text"] = data_df["Text"].apply(
        lambda x: len(re.sub("[^0-9]+", "", str(x))))
    data_df["Number_of_Separators_Text"] = data_df["Text"].apply(
        lambda x: len((re.sub("[\w]+", "", str(x))).replace(" ", "")))
    data_df["Is_Email_Exists"] = data_df["Text"].apply(
        email_match)  # place 1 wherever an email is found, else 0
    data_df["Number_of_spaces"] = data_df["Text"].apply(
        lambda x: str(x).count(' '))  # counts number of spaces

    # Clean data - tokenization, stop word check, size filter, stemming - Dutch language
    ss = SnowballStemmer("dutch", "french")

    def clean_data(doc):
        ignore = list(set(stopwords.words('dutch', 'french')))  # list of stopwords to ignore
        exl_chars = list(set(string.punctuation))
        exl_chars.append('€')
        # remove email ids to avoid conflicts in vocabulary construction
        doc = re.sub("[\w\.-]+@[\w\.-]+", " ", str(doc))
        doc = re.sub("\d", " ", str(doc))
        doc = ''.join([ch for ch in doc if ch not in exl_chars])
        words = []
        for i in word_tokenize(doc):  # tokenization
            if i not in ignore:
                if len(i) >= 2:  # standalone letters do not add any value
                    i = ss.stem(i)
                    words.append(i)
        doc = ' '.join(list(set(words)))
        return doc

    print("Cleaning Text Data: Start")
    data_df["Text"] = data_df["Text"].apply(clean_data)  # tokenize, stem and lemmatize
    print("Cleaning Text Data: Stop")

    print("Computing POS Vectors: Start")
    # training_corpus = alp.tagged_sents()
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)

    def count_adj(doc):
        tags = tagger.tag(doc.split())
        # drop garbage tokens (three identical leading or trailing characters);
        # iterate over a copy so removal is safe, and test the word (tup[0]), not the tuple
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            elif len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_adj_adv = counts['adv'] + counts['adj']
        return count_adj_adv

    def count_nn(doc):
        tags = tagger.tag(doc.split())
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            elif len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_nn = counts['noun']
        return count_nn

    def count_verb(doc):
        tags = tagger.tag(doc.split())
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            elif len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_verb = counts['verb']
        return count_verb

    data_df["Adv_Adj_Count"] = data_df["Text"].apply(count_adj)
    data_df["NN_count"] = data_df["Text"].apply(count_nn)
    data_df["Verb_count"] = data_df["Text"].apply(count_verb)
    print("Computing POS Vectors: Stop")

    print("Computing Vocabulary: Start")
    # store all the words of the positive class in a list
    docs_pos = []
    docs_pos.extend(
        word_tokenize(words) for words in data_df.Text[data_df.gold == 1])
    docs_pos = list(itertools.chain(*docs_pos))
    # Clean text data - remove words like iiiiiii, hhhhhccchhhh, abvwwwwwcgdccc
    # (iterate over a copy since items are removed from the list)
    for i in list(docs_pos):
        first_3_characters = i[:3]
        last_3_characters = i[-3:]
        if len(i) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
            docs_pos.remove(i)
        if i in docs_pos and len(i) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
            docs_pos.remove(i)
    print("Positive class words are stored successfully")
    all_words_pos = nltk.FreqDist(docs_pos)

    print("Computing vocabulary based on Positive Class")
    # find popular words; popular means occurring at least 25 times in the corpus
    popular_pos_words = []
    for i in all_words_pos.items():
        if i[1] >= 25:
            popular_pos_words.append(i[0])

    # Filter nouns from the popular positive class words
    tagged_pos_words = tagger.tag(popular_pos_words)
    filtered_tag_pos_words_nouns = []
    for word in tagged_pos_words:
        if word[1] == 'noun':
            filtered_tag_pos_words_nouns.append(word[0])
    vocab_pos = list(set(filtered_tag_pos_words_nouns))
    vocabulary = list(set(vocab_pos))

    # save vocabulary
    with open("vocab.txt", "wb") as fp:
        pickle.dump(vocabulary, fp)
    print("Computing Vocabulary: Stop")
    print("Length of Vocabulary: ", len(vocabulary))

    print("Computing Bag of Words Vectors: Start")

    def build_features(doc):
        vector = np.zeros((1, len(vocabulary)), dtype=np.int64)
        for w in word_tokenize(doc):
            for idx, vocab in enumerate(vocabulary):
                if vocab == w:
                    vector[0][idx] += 1
        return vector

    bag_vectors = data_df["Text"].apply(build_features)
    feature_vectors = np.zeros((data_df.shape[0], len(vocabulary)), dtype=np.int64)
    for pos, index in enumerate(data_df.index.values):
        feature_vectors[pos, :] = bag_vectors[index]
    cols = ["BOW_" + str(col) for col in range(0, len(vocabulary))]
    for col_index, col in enumerate(cols):
        data_df[col] = feature_vectors[:, col_index].reshape(data_df.shape[0], 1)
    print("Computing Bag of Words Vectors: Stop")
    print("Computing Context Transformers: Stop")

    print("Computing Location Transformers: Start")
    data_df["location_page_nr"] = data_df["page_nr"].apply(lambda x: 100 if x >= 50 else x)
    data_df["location_line_nr"] = data_df["line_nr"].apply(lambda x: 100 if x >= 50 else x)
    print("Computing Location Transformers: Stop")
    print("Total Number of Newly Added Features:", data_df.shape[1] - 7)

    print("Building ML - Neural Network Model: Start")
    X = data_df.drop([
        "candidate", "Text", "gold", "label", "line_after", "line_at",
        "line_before", "line_nr", "page_nr"
    ], axis=1)
    y = data_df.gold
    # Normalisation
    X = (X - X.mean(axis=0)) / X.std(axis=0)

    def build_model(input_shape):
        model = Sequential()
        model.add(Dense(1024, input_shape=(input_shape, )))
        model.add(Activation('sigmoid'))
        model.add(Dense(512))
        model.add(Activation('sigmoid'))
        model.add(Dense(128))
        model.add(Activation('sigmoid'))
        model.add(Dense(1, activation="sigmoid"))
        model.compile(optimizer='adam',
                      loss=tf.keras.losses.mean_squared_error,
                      metrics=['accuracy'])
        return model

    # Stratified k-fold
    k_fold_outer = model_selection.StratifiedKFold(n_splits=5)
    scores = []
    split = 0
    for train_index, test_index in k_fold_outer.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        model = build_model(X_train.shape[1])
        history = model.fit(X_train, y_train, epochs=5,
                            batch_size=1024, verbose=1)
        results = model.evaluate(X_val, y_val)
        scores.append(results[1])
        split += 1
        del model, history, results

    model = build_model(X.shape[1])
    model.fit(X, y, verbose=0)
    print('Saving the Model *.h5...')
    model.save('model_candidate_filter.h5')

    yHat_proba = model.predict(X)
    yHat = np.copy(yHat_proba)
    yHat[yHat <= 0.5] = 0
    yHat[yHat > 0.5] = 1
    br_score = np.around(metrics.brier_score_loss(y, yHat_proba, pos_label=1), decimals=5)

    print("Storing Results in .csv file")
    confidence = np.zeros((yHat_proba.shape[0], yHat_proba.shape[1]))
    for i in range(0, yHat_proba.shape[0]):
        if yHat_proba[i] <= 0.5:
            confidence[i] = 1 - yHat_proba[i]
        else:
            confidence[i] = yHat_proba[i]
    results_data_frame = pd.DataFrame(
        columns=["Predictions", "Confidence Level"], index=data_df.index)
    results_data_frame["Predictions"] = yHat.astype(np.int64).ravel()
    results_data_frame["Confidence Level"] = np.around(confidence, decimals=4)
    results_data_frame.to_csv("Results_predictions_confidence_train.csv",
                              encoding='utf-8', header=True, index=True)
    return np.mean(scores), br_score
def labelClustersWithKeyPhrases(labels, myReader, num_clusters, n):
    top_features_list = []
    tagger = PerceptronTagger()
    pos_tag = tagger.tag
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    """
    # Create phrase tree
    chunker = nltk.RegexpParser(grammar)
    stop = ENGLISH_STOP_WORDS
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    # generator, generates leaves one by one
    def leaves(tree):
        """Finds NP (nounphrase) leaf nodes of a chunk tree."""
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'
                                     or t.label() == 'JJ' or t.label() == 'RB'):
            yield subtree.leaves()

    # stemming, lemmatizing, lower case...
    def normalise(word):
        """Normalises words to lowercase and stems and lemmatizes it."""
        word = word.lower()
        word = stemmer.stem(word)
        word = lemmatizer.lemmatize(word)
        return word

    # stop-words and length control
    def acceptable_word(word):
        """Checks conditions for acceptable word: length, stopword."""
        accepted = bool(2 <= len(word) <= 40 and word.lower() not in stop)
        return accepted

    # generator, creates one item at a time
    def get_terms(tree):
        for leaf in leaves(tree):
            term = [normalise(w) for w, t in leaf if acceptable_word(w)]
            # Phrases only
            if len(term) > 1:
                yield term

    def flatten(npTokenList):
        finalList = []
        for phrase in npTokenList:
            token = ''
            for word in phrase:
                token += word + ' '
            finalList.append(token.rstrip())
        return finalList

    for cluster in range(num_clusters):
        # indices of documents in cluster
        indices = [index for index, clusterNum in enumerate(labels) if clusterNum == cluster]
        clusterCorpus = [doc_dict['negative_feedback']
                         for (docnum, doc_dict) in myReader.iter_docs()
                         if docnum in indices]
        # join into a single string so re.findall below receives text, not a list
        clusterCorpus = ' '.join(clusterCorpus)
        counter = Counter()
        counter.update(flatten([word for word in get_terms(
            chunker.parse(pos_tag(re.findall(r'\w+', clusterCorpus))))]))
        most_common_n = counter.most_common(n)
        top_features = [feature[0] for feature in most_common_n]
        top_features_list.append(top_features)

    feature_names_df = pd.DataFrame(top_features_list, columns=['1', '2', '3', '4', '5'])
    return feature_names_df
def run_test(my_corpus):
    if my_corpus == treebank:
        print('Corpus Info:')
        print('  Corpus: treebank')
        print('  Tagged Sents:', len(my_corpus.tagged_sents()))
        print('  Tagged Words:', len(my_corpus.tagged_words()))
        my_tagged_sents = my_corpus.tagged_sents()
        my_sents = my_corpus.sents()
    elif my_corpus == brown:
        print('Corpus Info:')
        print('  Corpus: brown')
        print('  Tagged Sents:', len(my_corpus.tagged_sents()))
        print('  Tagged Words:', len(my_corpus.tagged_words()))
        print('  Tagged Sents (news):', len(my_corpus.tagged_sents(categories='news')))
        print('  Tagged Words (news):', len(my_corpus.tagged_words(categories='news')))
        my_tagged_sents = my_corpus.tagged_sents(categories='news')
        my_sents = my_corpus.sents(categories='news')
        # print('  Tagged Sents :', len(my_corpus.tagged_sents()))
        # print('  Tagged Words :', len(my_corpus.tagged_words()))
        # my_tagged_sents = my_corpus.tagged_sents()
        # my_sents = my_corpus.sents()
    else:
        return

    fold = 5
    print('Performing', fold, 'fold cross validation on corpus ...')
    train_accuracy = []
    test_accuracy = []
    train_runtime = []
    test_runtime = []
    for k in range(fold):
        train_data = [x for i, x in enumerate(my_tagged_sents) if i % fold != k]
        validation_data = [x for i, x in enumerate(my_tagged_sents) if i % fold == k]
        # test_data = [x for i, x in enumerate(my_sents) if i % fold == k]
        print('Fold', k, ' has', len(train_data), 'train sentences and',
              len(validation_data), 'test sentences')

        perceptron_pos_tagger = PerceptronTagger(load=False)
        begin = time.time()
        perceptron_pos_tagger.train(train_data)
        end = time.time()
        train_acc = perceptron_pos_tagger.evaluate(train_data)
        train_accuracy.append(train_acc)
        train_runtime.append(end - begin)
        print('  Train accuracy =', train_acc, ' runtime =', end - begin)

        begin = time.time()
        test_acc = perceptron_pos_tagger.evaluate(validation_data)
        end = time.time()
        test_accuracy.append(test_acc)
        test_runtime.append(end - begin)
        print('  Test accuracy =', test_acc, ' runtime =', end - begin)

    print('Results:')
    print('%15s %15s %15s %15s %15s' % ('Fold', 'Train-Accuracy', 'Train-Runtime',
                                        'Test-Accuracy', 'Test-Runtime'))
    for k in range(fold):
        print('%15d %15.3f%% %15.5f %15.3f%% %15.5f' % (
            k, train_accuracy[k] * 100, train_runtime[k],
            test_accuracy[k] * 100, test_runtime[k]))
    avg_train_acc = sum(train_accuracy) / len(train_accuracy)
    avg_train_runtime = sum(train_runtime) / len(train_runtime)
    avg_test_acc = sum(test_accuracy) / len(test_accuracy)
    avg_test_runtime = sum(test_runtime) / len(test_runtime)
    print('%15s %15.3f%% %15.5f %15.3f%% %15.5f' % (
        'Average', avg_train_acc * 100, avg_train_runtime,
        avg_test_acc * 100, avg_test_runtime))
    return
def main(file_input):
    test_data = pd.read_csv(str(file_input) + '.csv')
    # test_data = pd.read_csv(str(file_input) + '.csv', index_col='Unnamed: 0')
    print("Loaded .csv file Successfully")

    print("Missing Value Treatment : Start")
    # missing values treatment
    while test_data.isnull().sum().values.sum() != 0:
        # idxmax returns the label of the column with the most missing values
        col_with_missing_val = (test_data.isnull().sum()).idxmax()
        # drop corresponding rows that have NaN values
        test_data = test_data[test_data[col_with_missing_val].notnull()]
        print(col_with_missing_val)
    print("Missing Value Treatment : Stop")
    print("Total Number of Samples:", test_data.shape[0])
    print("Total Number of Features:", test_data.shape[1])

    print("Computing Pattern Transformers: Start")
    # pattern transformers
    pattern_strictlyDigits = "^[0-9]*$"
    test_data["strictly_Digits"] = test_data["candidate"].str.contains(
        pattern_strictlyDigits, regex=True).astype(np.int64)
    test_data["Number_of_Digits"] = test_data['candidate'].apply(
        lambda x: len(re.sub("\W", "", x)))
    test_data["Number_of_Separators"] = test_data['candidate'].apply(
        lambda x: len(re.sub("\w", "", x)))
    test_data["Length_of_Candidate"] = test_data['candidate'].apply(
        lambda x: len(x))
    print("Computing Pattern Transformers: Stop")

    print("Computing Context Transformers: Start")
    # context transformers
    test_data["Text"] = test_data["line_before"] + test_data["line_at"] + test_data["line_after"]

    def email_match(doc):
        match = re.search(r'[\w\.-]+@[\w\.-]+', str(doc))
        if match != None:
            return 1
        else:
            return 0

    test_data["Number_of_Characters_Text"] = test_data["Text"].apply(
        lambda x: len(re.sub("[^a-z]", "", str(x))))
    test_data["Number_of_Digits_Text"] = test_data["Text"].apply(
        lambda x: len(re.sub("[^0-9]+", "", str(x))))
    test_data["Number_of_Separators_Text"] = test_data["Text"].apply(
        lambda x: len((re.sub("[\w]+", "", str(x))).replace(" ", "")))
    test_data["Email_Exists"] = test_data["Text"].apply(
        email_match)  # place 1 wherever an email is found, else 0
    test_data["Number_of_spaces"] = test_data["Text"].apply(
        lambda x: str(x).count(' '))  # counts number of spaces

    # Clean data - tokenization, stop word check, size filter, stemming - Dutch language
    ss = SnowballStemmer("dutch", "french")

    def clean_data(doc):
        ignore = list(set(stopwords.words('dutch', 'french')))  # list of stopwords to ignore
        exl_chars = list(set(string.punctuation))
        exl_chars.append('€')
        # remove email ids to avoid conflicts in vocabulary construction
        doc = re.sub("[\w\.-]+@[\w\.-]+", " ", str(doc))
        doc = re.sub("\d", " ", str(doc))
        doc = ''.join([ch for ch in doc if ch not in exl_chars])
        words = []
        for i in word_tokenize(doc):  # tokenization
            if i not in ignore:
                if len(i) >= 2:  # standalone letters do not add any value
                    i = ss.stem(i)
                    words.append(i)
        doc = ' '.join(list(set(words)))
        return doc

    test_data["Text"] = test_data["Text"].apply(clean_data)  # tokenize, stem and lemmatize

    # training_corpus = alp.tagged_sents()
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)

    def count_adj(doc):
        tags = tagger.tag(doc.split())
        # drop garbage tokens (three identical leading or trailing characters);
        # iterate over a copy so removal is safe, and test the word (tup[0]), not the tuple
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            elif len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_adj_adv = counts['adv'] + counts['adj']
        return count_adj_adv

    def count_nn(doc):
        tags = tagger.tag(doc.split())
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            elif len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_nn = counts['noun']
        return count_nn

    def count_verb(doc):
        tags = tagger.tag(doc.split())
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            elif len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_verb = counts['verb']
        return count_verb

    test_data["Adv_Adj_Count"] = test_data["Text"].apply(count_adj)
    test_data["NN_count"] = test_data["Text"].apply(count_nn)
    test_data["Verb_count"] = test_data["Text"].apply(count_verb)
    print("Computing Context Transformers: Stop")

    # load the vocabulary
    with open("vocab.txt", "rb") as fp:
        vocabulary = pickle.load(fp)

    print("Computing Bag of Words Vectors: Start")

    def build_features(doc):
        vector = np.zeros((1, len(vocabulary)), dtype=np.int64)
        for w in word_tokenize(doc):
            for i, word in enumerate(vocabulary):
                if word == w:
                    vector[0][i] += 1
        return vector

    bag_vectors = test_data["Text"].apply(build_features)
    feature_vectors = np.zeros((test_data.shape[0], len(vocabulary)), dtype=np.int64)
    for pos, index in enumerate(test_data.index.values):
        feature_vectors[pos, :] = bag_vectors[index]
    cols = ["BOW_" + str(col) for col in range(0, len(vocabulary))]
    for col_index, col in enumerate(cols):
        test_data[col] = feature_vectors[:, col_index].reshape(test_data.shape[0], 1)
    print("Computing Bag of Words Vectors: Stop")

    print("Computing Location Transformers: Start")
    test_data["location_page_nr"] = test_data["page_nr"].apply(
        lambda x: 100 if x >= 50 else x)
    test_data["location_line_nr"] = test_data["line_nr"].apply(
        lambda x: 100 if x >= 50 else x)
    print("Computing Location Transformers: Stop")

    print("Loading Model...")
    model = tf.keras.models.load_model('model_candidate_filter.h5')
    model.compile(loss=tf.keras.losses.mean_squared_error,
                  optimizer='adam',
                  metrics=['accuracy'])
    print("Loaded Model Successfully!")

    X_test = test_data.drop([
        "candidate", "Text", "label", "line_after", "line_at", "line_before",
        "page_nr", "line_nr"
    ], axis=1)
    X_test = (X_test - X_test.mean(axis=0)) / X_test.std(axis=0)

    yHat_proba = model.predict(X_test)
    yHat = np.copy(yHat_proba)
    yHat[yHat <= 0.5] = 0
    yHat[yHat > 0.5] = 1

    print("Storing Results in .csv file")
    confidence = np.zeros((yHat_proba.shape[0], yHat_proba.shape[1]))
    for i in range(0, yHat_proba.shape[0]):
        if yHat_proba[i] <= 0.5:
            confidence[i] = 1 - yHat_proba[i]
        else:
            confidence[i] = yHat_proba[i]
    results_data_frame = pd.DataFrame(
        columns=["Predictions", "Confidence Level"], index=test_data.index)
    results_data_frame["Predictions"] = yHat.astype(np.int64).ravel()
    results_data_frame["Confidence Level"] = np.around(confidence, decimals=4)
    results_data_frame.to_csv("Results_predictions_confidence_run.csv",
                              encoding='utf-8', header=True, index=True)
def __init__(self):
    nltk.download('averaged_perceptron_tagger')
    self.tagger = PerceptronTagger()
    self.lemmatizer = WordNetLemmatizer()
    self.stopwords = list(stopwords.words('english'))
    self.auto_correct_remaining = 0
def count_verbs(self, s0):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0)
    # counts only tokens tagged VBP (verb, non-3rd person singular present)
    V_s0 = [values[0] for values in s0_tags if values[1] == 'VBP']
    return len(V_s0)
def count_nouns(self, s0):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0)
    # counts only tokens tagged NN (singular or mass noun)
    NN_s0 = [values[0] for values in s0_tags if values[1] == 'NN']
    return len(NN_s0)
import numpy as np
import torch
from torch.autograd import Variable
import pickle
from collections import Counter
from torch import nn
import torch.nn.functional as F
from nltk.tag import PerceptronTagger
from nltk.corpus import alpino as alp
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import PunktSentenceTokenizer

training_corpus = list(alp.tagged_sents())
tagger = PerceptronTagger(load=True)
tagger.train(training_corpus)
wordTokenizer = WordPunctTokenizer()
sentTokenizer = PunktSentenceTokenizer()


def generate_vocabulary(data, vocabulary_size):
    all_data = " ".join(data)
    print(all_data[:100])
    words = [
        word for sent in sentTokenizer.tokenize(all_data)
        for word in wordTokenizer.tokenize(sent)
    ]
    counter = Counter(words)
    # most_common() produces the k most frequently encountered
    # input values and their respective counts.
    most_common = counter.most_common(vocabulary_size)
    vocabulary = set([word for word, count in most_common])
    return vocabulary  # assumed completion; the original snippet ends without a return
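# A hedged usage sketch (assumption, not in the original module): build a small vocabulary
# from two sample Dutch strings. It relies on the `return vocabulary` completion noted above.
sample_docs = ["De kat zit op de mat.", "De hond ligt in de tuin."]
print(generate_vocabulary(sample_docs, vocabulary_size=10))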
def __init__(self):
    super(CountAdjectives, self).__init__()
    self.tagger = PerceptronTagger(load=True)
    training_corpus = list(alpino.tagged_sents())
    self.tagger.train(training_corpus)
def get_keyphrases(self, textInput, min_freq=2):
    # setting up the tagger
    # (from http://stackoverflow.com/a/35964709)
    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    # This grammar is described in the paper by S. N. Kim, T. Baldwin, and M.-Y. Kan.
    # Evaluating n-gram based evaluation metrics for automatic keyphrase extraction.
    # Technical report, University of Melbourne, Melbourne 2010.
    StopWords = stopwords.words('english')

    def leaves(tree):
        """Finds NP (nounphrase) leaf nodes of a chunk tree."""
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            yield subtree.leaves()

    def acceptable_word(word):
        """Checks conditions for acceptable word: length, stopword."""
        accepted = bool(2 < len(word) and word.lower() not in StopWords)
        return accepted

    def normalise(word):
        """Normalises words to lowercase and stems and lemmatizes it."""
        word = word.lower()
        word = stemmer.stem(word)
        word = lemmatizer.lemmatize(word)
        return word

    def get_terms(tree):
        for leaf in leaves(tree):
            # can replace normalise with w.lower() if you don't want to normalize the words
            term = [normalise(w) for w, t in leaf if acceptable_word(w)]
            yield term

    def get_nounPhrases(textInput, minWordLength=2):
        grammar = r"""
            NBAR:
                {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        """
        chunker = nltk.RegexpParser(grammar)
        toks = nltk.word_tokenize(textInput)
        # print(toks)
        pos_tag = tagger.tag
        postoks = pos_tag(toks)
        tree = chunker.parse(postoks)
        terms = get_terms(tree)
        nounPhraseList = []
        for tid, term in enumerate(terms):
            templist = []
            for wid, word in enumerate(term):
                # print("TID: ", tid, " WID: ", (wid + 1), word)
                templist.append(word)
            s = " "
            nounPhraseList.append(s.join(templist))
        nounPhraseList = [word for word in nounPhraseList if len(word.split()) >= minWordLength]
        return nounPhraseList

    counter = Counter()
    for nounPhrase in get_nounPhrases(textInput):
        # print(nounPhrase)
        counter.update([nounPhrase])

    keyphraseDF = pandas.DataFrame(
        [[key, value] for key, value in counter.items() if value >= min_freq],
        columns=['keyphrase_stemmed', 'frequency'])
    (docsDF, occurrenceDF) = self.get_occurrence(keyphraseDF)
    print("docs", docsDF)
    print("keys", keyphraseDF)
    keyphraseDF = keyphraseDF.join(docsDF["docs"])
    print(occurrenceDF)
    keyphraseDF = keyphraseDF.join(self.get_fullphrases(keyphraseDF=keyphraseDF)["keyphrase_full"])
    keyphraseDF = keyphraseDF.join(self.get_MIs(occurrenceDF=occurrenceDF)["MI"])
    keyphraseDF = keyphraseDF.join(
        self.get_PMIs(occurrenceDF=occurrenceDF, metric="sentiment_class", value="positive")["PMI_pos"])
    keyphraseDF = keyphraseDF.join(
        self.get_PMIs(occurrenceDF=occurrenceDF, metric="sentiment_class", value="negative")["PMI_neg"])
    # keyphraseDF = keyphraseDF.join(self.get_PMIs(keyphraseDF["Keyphrase_stemmed"].tolist(), "neg"))
    return keyphraseDF
from nltk.tag import PerceptronTagger
from nltk.data import find
import glob

# code for loading the perceptron tagger
PICKLE = "averaged_perceptron_tagger.pickle"
AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
tagger = PerceptronTagger(load=False)
tagger.load(AP_MODEL_LOC)
pos_tag = tagger.tag

# lists to store the POS and NP lists generated from each file
GlobalPOSList = []
GlobalNPList = []

# getting filenames of dataset files
fileList = glob.glob("C:/Users/Vinod Chhapariya/Desktop/TDBMS/Benchmark Dataset/*.txt")

# printing filenames
for filename in fileList:
    print(filename)

# POS tagging using the Perceptron tagger
for filename in fileList:
    POSList = []
    NPList = []
    filePOSTagWrite = open(filename + "_POSTag_Perceptron", 'w')
    for line in open(filename, 'r').readlines():
        tags = pos_tag(line.split())
# sentence = sentence.rstrip()
doc = nlp(sentence)
for token in doc:
    dependency = [token.text, token.dep_, token.shape_, token.is_alpha,
                  token.is_stop, [child for child in token.children]]
    if dependency[0] == "\n":
        whole_sen.append(parsed)
        parsed = []
    else:
        parsed.append(dependency)

# Frisian sentences tagged with the perceptron tagger; the list is referred to as `fr` below
fr = []
tagger = PerceptronTagger()
with open('frysian_data.txt', 'r', encoding="utf-8") as fr_infile:
    for sentence in fr_infile:
        sentence = word_tokenize(sentence)
        pos = tagger.tag(sentence)
        fr.append(pos)

other = []
final = []
fr_longer = []
for k in range(len(fr)):
    fries = fr[k]
    parsed = whole_sen[k]
    if len(fries) == len(parsed):
        for words, fr_words in zip(parsed, fries):
            print(words[0])
class text_clean:
    def __init__(self):
        pass

    def punctuation_trimming(self, sent):
        y = [x for x in sent if x not in string.punctuation]
        return y

    def special_char_removal(self, tok):
        z = [re.sub('[^A-Za-z0-9]+', '', token) for token in tok]
        z = [x for x in z if x]
        return z

    # Remove stop words
    stop_words = set(stopwords.words('english'))

    def stopw_rem(self, tok):
        clean_tokens = tok[:]
        for token in tok:
            if token in self.stop_words:
                clean_tokens.remove(token)
        return clean_tokens

    # Convert to lower case
    def conv_to_lower(self, sent):
        newtok = [item.lower() for item in sent]
        return newtok

    # POS tagger
    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:' + str(
        find('taggers/averaged_perceptron_tagger/' + PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)
    pos_tag = tagger.tag

    # Extract nouns
    def noun_iden(self, sent):
        tok = word_tokenize(sent)
        nountok = [
            word for (word, pos) in self.pos_tag(tok) if pos[:2] == 'NN'
        ]
        return nountok

    # Identify the POS and lemmatize accordingly, using the pos parameter of the lemmatizer
    ### Lemmatization
    lemmatizer = WordNetLemmatizer()

    def lemm(self, sent):
        tok = word_tokenize(sent)
        tok2 = []
        for word, tag in self.pos_tag(tok):
            if tag.startswith("NN"):
                temp = self.lemmatizer.lemmatize(word, pos='n')
            elif tag.startswith('VB'):
                temp = self.lemmatizer.lemmatize(word, pos='v')
            elif tag.startswith('JJ'):
                temp = self.lemmatizer.lemmatize(word, pos='a')
            else:
                temp = word
            tok2.append(temp)
        return ' '.join(tok2)

    ### Stemming
    ps = PorterStemmer()

    def stem(self, sent):
        newtok = [self.ps.stem(w) for w in word_tokenize(sent)]
        return ' '.join(newtok)
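# A hedged usage sketch for the text_clean class above (the sample sentences are made up,
# not from the original code); it assumes the NLTK data used by the class (stopwords,
# the averaged perceptron tagger pickle, and WordNet) is already downloaded.
tc = text_clean()
print(tc.noun_iden("The quick brown fox jumps over the lazy dog"))
print(tc.lemm("The cats were running across the fields"))
print(tc.stem("The cats were running across the fields"))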
from collections import Counter

# count_good_raw = Counter(good_raw)
count_good_actors = Counter(good_actors)
count_good_actions = Counter(good_actions)

# number of statements
nos = len(tokenized_actions)
# number of good actors
noga = len(count_good_actors)
# number of good actions
nogc = len(count_good_actions)

PICKLE = "taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle"
import nltk.data
from nltk.tag import PerceptronTagger

_nltk_pos_tagger = PerceptronTagger(load=False)
_nltk_pos_tagger.load(PICKLE)

print(count_good_actors)

S = np.zeros(shape=(nos, noga + nogc))
i = 0
for sent_pos in tokenized_actors:
    for token1 in sent_pos:
        j = 0
        tt1 = _nltk_pos_tagger.tag([token1])
        for feature in count_good_actors:
            ft = _nltk_pos_tagger.tag([feature])
            simval = word_sim(tt1[0], ft[0], i)
            S[i][j] = S[i][j] + simval
            j = j + 1
    i = i + 1