def vectorizer(tokens, w2v_db):
    """Tag *tokens* with the Stanford POS tagger, keep nouns, and look up
    their vectors in the word2vec store at *w2v_db*.

    Returns (unsorted_kw, token_vecs): an OrderedDict of lowercased noun ->
    weighted frequency (proper nouns / foreign words weigh 1.5, common
    nouns 1), and an OrderedDict of word -> vector, both in document order.
    """
    db_path = w2v_db
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue  # keep nouns only
        w = w.lower()
        # Accumulate the weight; .get() replaces the try/except KeyError dance.
        unsorted_kw[w] = unsorted_kw.get(w, 0) + label

    # Get the vectors of words. Maintain order as in document.
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    try:
        for word in unsorted_kw:  # keys were already lowercased above
            if word in token_vecs:  # plain membership test, not try/except
                continue
            v = conn.read(word)
            if v is not None:  # idiomatic form (was `not v is None`)
                token_vecs[word] = list(v)
        # Output for debugging; total vs unique words.
        print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw),
                                                len(token_vecs)))
    finally:
        conn.close()  # close the DB connection even if a lookup raises
    return unsorted_kw, token_vecs
def stanford_corenlp_filter(sent):
    """Reduce a two-block sentence to stemmed, POS-filtered terms.

    *sent* is "<block1><blockSeparator><block2>".  Terms from block 1 are
    prefixed '1' and terms from block 2 prefixed '2' so their origin stays
    recoverable after stemming.  Relies on module-level `encoding`,
    `blockSeparator`, `filterList`, `stemmer` and `word_tokenize`.
    """
    from nltk.tag.stanford import POSTagger
    posTagger = POSTagger(
        '/Users/gt/Downloads/'
        'stanford-postagger-2013-06-20/models/'
        'wsj-0-18-bidirectional-nodistsim.tagger',
        '/Users/gt/Downloads/stanford-postagger-2013-06-20'
        '/stanford-postagger-3.2.0.jar',
        encoding=encoding)

    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    filtered_sent = ' '

    b1 = b1.lower()
    tokens = word_tokenize(b1)
    pos_tags = posTagger.tag(tokens)
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # note: 1 concat stemmer(word) == stemmer(1 concat word)
            filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

    b2 = b2.lower()
    tokens = word_tokenize(b2)
    pos_tags = posTagger.tag(tokens)
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # BUG FIX: the original reassigned filtered_sent = ' ' here,
            # throwing away every block-1 term; both blocks now accumulate.
            filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

    return filtered_sent
def nltk_stanfordpos(inpath, outfolder):
    """POS-Tagging French text with Stanford POS-Tagger via NLTK."""
    print("\nLaunched nltk_stanfordpos.")
    import os
    import glob
    from nltk.tag.stanford import POSTagger

    # Build the tagger ONCE: each construction spawns a JVM, so creating it
    # per file (as the original did) was very slow.
    st = POSTagger(
        '/home/christof/Programs/stanfordpos/models/french.tagger',
        '/home/christof/Programs/stanfordpos/stanford-postagger.jar',
        encoding="utf8")

    if not os.path.exists(outfolder):
        os.makedirs(outfolder)

    for file in glob.glob(inpath):
        with open(file, "r", encoding="utf-8") as infile:
            untagged = infile.read()
        tagged = st.tag(untagged.split())
        # One "word<TAB>tag" line per token; join is linear, unlike +=.
        taggedstring = "".join("\t".join(item) + "\n" for item in tagged)
        cleanfilename = os.path.basename(file)
        # Write UTF-8 explicitly to match the input (the original used the
        # locale default, which breaks accented French text on some systems).
        with open(os.path.join(outfolder, cleanfilename), "w",
                  encoding="utf-8") as output:
            output.write(taggedstring)
    print("Done.")
def __init__(self):
    # Resolve the Stanford POS resources relative to this source file so
    # the tagger works regardless of the current working directory.
    here = os.path.dirname(os.path.realpath(__file__))
    model = os.path.normpath(
        here + '/stanford-pos/models/english-bidirectional-distsim.tagger')
    jar = os.path.normpath(here + '/stanford-pos/stanford-postagger.jar')
    self.st = POSTagger(model, jar)
def __init__(self, override=False):
    """Set up the Stanford POS tagger runner.

    Raises LookupError when the tagger data has not been downloaded into
    the per-user data directory yet.
    """
    tagger_path = os.path.join(DIRS.user_data_dir, stanford_postagger_name)
    if not os.path.exists(tagger_path):
        raise LookupError("Stanford POS tagger not found. Try running the "
                          "command download_third_party_data.py")
    model = os.path.join(tagger_path, 'models',
                         'english-bidirectional-distsim.tagger')
    jar = os.path.join(tagger_path, 'stanford-postagger.jar')
    postagger = POSTagger(model, jar, encoding='utf8')
    # Delegate batch tagging to the parent runner.
    super(StanfordTaggerRunner, self).__init__(postagger.batch_tag, override)
def processor(name, url, tokens, db_path, json_dir, USE_TITLE_WORDS=False):
    """POS-tag *tokens*, accumulate weighted noun frequencies, fetch their
    word vectors from the DB at *db_path*, k-means-cluster the vectors and
    dump everything to <json_dir>/<name>.json.
    """
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue  # nouns only
        w = w.lower()
        # .get() replaces the try/except KeyError accumulation idiom.
        unsorted_kw[w] = unsorted_kw.get(w, 0) + label

    # Get the vectors list (document order preserved).
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    try:
        for word in unsorted_kw:  # keys already lowercased above
            if word in token_vecs:  # membership test, not try/except
                continue
            v = conn.read(word)
            if v is not None:  # idiomatic form (was `not v is None`)
                token_vecs[word] = list(v)
        print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw),
                                                len(token_vecs)))
    finally:
        conn.close()  # always release the DB connection

    # Compute cluster centers: roughly one centroid per 4 vectorized words.
    nk = round(len(token_vecs) / 4)
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data, nk, iter=20, minit='points')
    centroids = cent.tolist()

    # Create the JSON object for this webpage.
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir, name + '.json')
    # `with` guarantees the file is closed even if json.dump raises.
    with open(json_path, 'w') as file_dest:
        json.dump(
            {
                'url': url,
                'vectors': token_vecs,
                'keyword_frequency': unsorted_kw,
                'centroids': centroids
            }, file_dest)
def tag(segments):
    """POS-tag each text segment and return it as a 'word/TAG ...' unicode
    string (one entry per segment)."""
    st = POSTagger(
        os.path.join(stanford_path, 'models/english-left3words-distsim.tagger'),
        os.path.join(stanford_path, 'stanford-postagger-3.3.1.jar'))
    tagged = []
    for segment in segments:
        pairs = st.tag(word_tokenize(segment))
        joined = ' '.join(nltk.tag.tuple2str(pair) for pair in pairs)
        # Python 2: promote the joined byte string to unicode.
        tagged.append(joined.decode('utf-8'))
    return tagged
def pos_tag(to_tag,
            model_path=root_path + "\\stanford-postagger-full-2013-06-20\\models\\french.tagger",
            jar_path=root_path + "\\stanford-postagger-full-2013-06-20\\stanford-postagger.jar"):
    '''tag the tokens with part of speech; to_tag is the tags; model_path is
    the file path to the stanford POS tagger model; and jar_path to the
    Stanford POS tagger jar file'''
    # Wrap the Stanford tagger with UTF-8 in/out encoding.
    pos_tagger = POSTagger(model_path, jar_path, encoding='utf8')
    # Run the tagging algorithm on the tokenized raw text.
    return pos_tagger.tag(to_tag)
def __init__(self, pathToParser=None,
             javaHeapOptions='-Xmx4g -XX:+UseParallelGC -XX:-UseGCOverheadLimit'):
    # Locate the Stanford tagger jar and model under "sp/" — relative to the
    # current working directory when pathToParser is None, otherwise relative
    # to the given directory.  javaHeapOptions is passed straight to the JVM.
    if pathToParser is None:
        taggerLibraryPath = normpath(os.path.join(os.getcwd(),
                                                  "sp/jar/stanford-postagger.jar"))
        taggerModelPath = normpath(os.path.join(os.getcwd(),
                                                "sp/models/english-bidirectional-distsim.tagger"))
    else:
        taggerLibraryPath = normpath(os.path.join(pathToParser,
                                                  "sp/jar/stanford-postagger.jar"))
        taggerModelPath = normpath(os.path.join(pathToParser,
                                                "sp/models/english-bidirectional-distsim.tagger"))
    self.stanfordTagger = POSTagger(taggerModelPath, taggerLibraryPath,
                                    java_options=javaHeapOptions)
    # NOTE(review): the trailing triple-quote below appears to open a
    # commented-out region that continues beyond this chunk — left untouched.
    """
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return a list of tuples of form
    (token, POS) where POS is the part of speech of token
    """
    from nltk.tag.stanford import POSTagger

    tagger = POSTagger('english-bidirectional-distsim.tagger',
                       'stanford-postagger.jar')
    return tagger.tag(toked_sentence)
def __init__(self, posTagModelPath, posTaggerPath, parserModelPath, workingDir): try: self.posTagger = POSTagger(posTagModelPath, posTaggerPath, "UTF-8") print "pos tagger is loaded" except: print "Error in loading POS tagger" try: self.parser = MaltParser(tagger=None, mco=parserModelPath, working_dir=workingDir) print "parser is loaded" except: print "Error in loading the MALT Parser"
def pos_tag(sent, tagger='stanford'):
    """POS-tag *sent* with either the Stanford tagger (default) or NLTK's.

    Raises ValueError for an unknown *tagger* name.
    """
    # Cache the (expensive, JVM-backed) Stanford tagger as a module-level
    # global so it is not recreated every time pos_tag is executed.
    if 'pos_tagger' not in globals():  # was `not 'pos_tagger' in globals()`
        global pos_tagger
        pos_tagger = POSTagger(conf.stanford_pos_model,
                               path_to_jar=conf.stanford_postagger,
                               encoding='UTF-8')
    if tagger == 'nltk':
        tokens = tokenize(sent, 's')
        return nltk.pos_tag(tokens)
    elif tagger == 'stanford':
        tokens = tokenize(sent, 'w')
        return pos_tagger.tag(tokens)
    else:
        raise ValueError('No such tagger: ' + tagger)
def add_POS(self, row_file, target):
    """POS-tag the tokens of *target* with the Stanford tagger and write
    'token TAG' lines (with a blank line between sentences) to pos_<target>.

    Capitalised words longer than one character are lowercased before
    tagging; the original surface form is written to the output file.
    (row_file is unused here — kept for interface compatibility.)
    """
    all_token = self.get_token(target)
    stanford_tagger = \
        POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger',
                  '../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
    tag_list = list()
    for row in all_token:
        temp_list = list()
        for word in row:
            # Lowercase capitalised words so the tagger sees normal forms.
            if len(word) > 1 and re.match(r'^[A-Z]+', word):
                temp_list.append(word.lower())
            else:
                temp_list.append(word)
        tag_list.append(temp_list)
    # (a stray no-op literal `1` statement was removed here)
    tagged_result = stanford_tagger.tag_sents(tag_list)
    # Write the ORIGINAL token next to the predicted tag.  `with` closes the
    # file deterministically; the original leaked the handle.
    with open('pos_%s' % target, 'wb') as w:
        for num1, row in enumerate(tagged_result):
            for num2, item in enumerate(row):
                w.write(all_token[num1][num2] + ' ' + item[1] + '\n')
            w.write('\n')
    return
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples8qfa

    Given a tokenized sentence, return a list of tuples of form
    (token, POS) where POS is the part of speech of token
    """
    from nltk.tag.stanford import POSTagger

    resources = '/home/satyam/zip/opinionproject/opinion_mining/resources/'
    tagger = POSTagger(resources + 'english-bidirectional-distsim.tagger',
                       resources + 'stanford-postagger.jar')
    return tagger.tag(toked_sentence)
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return a list of tuples of form
    (token, POS) where POS is the part of speech of token
    """
    from nltk.tag.stanford import POSTagger

    base = ('/Users/jeff/Zipfian/opinion-mining/references/resources/'
            'stanford-pos/stanford-postagger-2014-06-16/')
    tagger = POSTagger(base + 'models/english-bidirectional-distsim.tagger',
                       base + 'stanford-postagger.jar')
    return tagger.tag(toked_sentence)
def main(): data_file = open("../data/good_data.txt", "r") out_file = open("../data/good_lines_tags_1.txt", "w") lines = data_file.readlines() data_file.close() line_count = 0 english_postagger = POSTagger( '../postagger/models/english-bidirectional-distsim.tagger', '../postagger/stanford-postagger.jar') for line in lines: tag_list = [] for t in english_postagger.tag(line.split('\n')[0].split(' ')): tag_list.append(t[1]) out_file.write(" ".join(tag_list)) out_file.write("\n") print "completed line" + str(line_count) line_count += 1 out_file.close()
def main(word_transformation=None, result_path=None, save=SAVE, n=500): tagger = POSTagger( '/cs/fs/home/hxiao/code/CoreNLP/classes/edu/stanford/nlp/models/pos-tagger/english-left3words/english-bidirectional-distsim.tagger', '/cs/fs/home/hxiao/code/CoreNLP/javanlp-core.jar') tagged_corpus = nltk.corpus.treebank.tagged_sents()[-n:] print "extracting sentence words" if word_transformation and callable(word_transformation): tagged_corpus = [[(word_transformation(w), t) for w, t in sent] for sent in tagged_corpus] print "extracting sents/tags" sents = ([w for w, t in sent] for sent in tagged_corpus) correct_tags = [[t for w, t in sent] for sent in tagged_corpus] print "predicting" predicted_tags = [] really_correct_tags = [] # some sentence might be dropped sentences = [] for i, (ctags, sent) in enumerate(zip(correct_tags, sents)): if (i + 1) % 5 == 0: print "%d finished" % (i + 1) try: ptags = [t for w, t in tagger.tag(sent)] if len(ctags) == len(ptags): predicted_tags.append(ptags) really_correct_tags.append(ctags) sentences.append(sent) else: print "tags length does not match for %r" % (sent) except UnicodeDecodeError: print "UnicodeDecodeError for ", sent except Exception: traceback.print_exc() if save: print "dumping to '%s'" % (result_path) dump((really_correct_tags, predicted_tags, sentences), open(result_path, "w"))
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return a list of tuples of form
    (token, POS) where POS is the part of speech of token
    """
    from nltk.tag.stanford import POSTagger
    import os

    # BUG FIX: the original assigned basePath = os.getcwd() but then built
    # the resource paths from an undefined name `path`, raising NameError
    # at call time (unless a global `path` happened to exist).
    basePath = os.getcwd()
    st = POSTagger(
        basePath + '/resources/stanford-postagger-2015-12-09/models/english-bidirectional-distsim.tagger',
        basePath + '/resources/stanford-postagger-2015-12-09/stanford-postagger.jar')
    return st.tag(toked_sentence)
def pos_data(self, method='stanford'): ''' pos data with alternative method --stanford with pos-tagger writen by stanford,or --nltk (other word) with the pos-tagger inside NLTK ''' print '正在标注语料....' my_tag = int if method == 'stanford': st=POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger'\ ,'../stanford-postagger-full-2015-01-30/stanford-postagger.jar') my_tag = st.tag_sents #get tagged train_data sentences = list() for sentence in self.train_data: sentences.append(self.tk.word_tokenize(sentence)) self.tagged_train_data = my_tag(sentences) #get tagged test_data sentences = list() for sentence in self.test_data: sentences.append(self.tk.word_tokenize(sentence)) self.tagged_test_data = my_tag(sentences) elif method == 'nltk': my_tag = nltk.pos_tag #get tagged train_data tagged_train_data = list() for row in self.train_data: tagged_train_data.append(my_tag(row.split())) #get tagged test_data tagged_test_data = list() for row in self.test_data: tagged_test_data.append(my_tag(row.split())) self.tagged_train_data = tagged_train_data self.tagged_test_data = tagged_test_data pickle.dump(self.tagged_train_data, open('__tagged_train_data', 'wb')) pickle.dump(self.tagged_test_data, open('__tagged_test_data', 'wb')) #self.tagged_train_data=pickle.load(open('__tagged_train_data','rb')); #self.tagged_test_data=pickle.load(open('__tagged_test_data','rb')); print '完成!' return
def __init__(self, posTagModelPath, posTaggerPath, parserModelPath, workingDir):
    """Load the Stanford POS tagger (16 GB JVM heap) and the MALT parser,
    logging — not raising — any load failure."""
    try:
        self.logger = logging.getLogger(__name__)
        self.posTagger = POSTagger(posTagModelPath, posTaggerPath,
                                   encoding="UTF-8",
                                   java_options='-Xmx16000m')
    # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
    # still propagate; the best-effort logging behaviour is preserved.
    except Exception:
        self.logger.warning("Error in loading POS tagger!")
        e = sys.exc_info()[0]
        self.logger.warning("Error:" + str(e))
    try:
        self.parser = MaltParser(tagger=None, mco=parserModelPath,
                                 working_dir=workingDir,
                                 additional_java_args=['-Xmx16000m'])
    except Exception:
        self.logger.warning("Error in loading the MALT Parser")
        e = sys.exc_info()[0]
        self.logger.warning("Error:" + str(e))
def __init__(self, name, is_lazy, lazy_directory, debug, encoding,
             tag_separator, stanford_jar_path, language_model_path):
    """
    Constructor of the component.

    @param  name:                The name of the component.
    @type   name:                C{string}
    @param  is_lazy:             True if the component must load previous data,
                                 False if data must be computed though they
                                 have already been computed.
    @type   is_lazy:             C{bool}
    @param  lazy_directory:      The directory used to store previously
                                 computed data.
    @type   lazy_directory:      C{string}
    @param  debug:               True if the component is in debug mode, else
                                 False. When the component is in debug mode, it
                                 will output each step of its processing.
    @type   debug:               C{bool}
    @param  encoding:            The encoding of the files to pre-process.
    @type   encoding:            C{string}
    @param  tag_separator:       The symbol to use as a separator between a
                                 word and its POS tag.
    @type   tag_separator:       C{string}
    @param  stanford_jar_path:   The path to the jar of the Java Stanford
                                 Tagger.
    @type   stanford_jar_path:   C{string}
    @param  language_model_path: The path to the language-specific Stanford
                                 model.
    @type   language_model_path: C{string}
    """
    super(StanfordPreProcessor, self).__init__(name, is_lazy, lazy_directory,
                                               debug, encoding, tag_separator)
    # Sentence splitting is done by NLTK's Punkt; tagging by the Stanford
    # tagger configured with the given model and jar.
    self.set_sentence_tokenizer(PunktSentenceTokenizer())
    self.set_pos_tagger(
        POSTagger(language_model_path, stanford_jar_path, encoding))
def generate_pos_set(self):
    # Build self.pos_set: candidate keyword nouns gathered from the training
    # labels, filtered by how often they occur OUTSIDE labels (negatives).
    print '正在构建正性集词典....'
    pos_dict = dict()
    pos_set = set()
    sentences = list()
    # Multi-word labels are tokenized for POS tagging below; single-word
    # labels are counted as positive occurrences directly.
    for row in self.train_label:
        for key in row:
            if ' ' in key:
                sentences.append(self.tk.word_tokenize(key))
            else:
                pos_dict[key] = pos_dict.setdefault(key, 0) + 1
                #pos_set.add(key);
    #end for
    st=POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger'\
        ,'../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
    result = st.tag_sents(sentences)
    # Every noun (NN*) inside a multi-word label also counts as positive.
    for row in result:
        for item in row:
            if item[1].startswith('NN'):
                pos_dict[item[0]] = pos_dict.setdefault(item[0], 0) + 1
                #pos_set.add(item[0]);
    #end for
    # Nouns in the tagged training data that are NOT part of that row's
    # labels count as negative evidence.
    neg_dict = dict()
    for num, row in enumerate(self.tagged_train_data):
        for item in row:
            if item[1].startswith(
                    'NN') and item[0] not in self.train_word_label[num]:
                neg_dict[item[0]] = neg_dict.setdefault(item[0], 0) + 1
    # Keep words seen positively more than once, unless negatives outnumber
    # positives by 2:1.  NOTE(review): Python 2 integer division here — the
    # ratio test truncates (e.g. 3/2 == 1); confirm that is intended.
    for key in pos_dict.keys():
        if pos_dict[key] > 1:
            if neg_dict.has_key(key):
                if neg_dict[key] / pos_dict[key] < 2:
                    pos_set.add(key)
            else:
                pos_set.add(key)
    self.pos_set = pos_set
    print '完成!'
    return
def get_whole(self, sentence): opinion_dict = dict() pos_f = open('../opinion-lexicon-English/positive-words.txt', 'rb') neg_f = open('../opinion-lexicon-English/negative-words.txt', 'rb') for _ in xrange(35): pos_f.readline() neg_f.readline() for word in pos_f: opinion_dict[word.strip()] = True for word in neg_f: opinion_dict[word.strip()] = False pos_f.close() neg_f.close() stemmer = PorterStemmer() stanford_parser = parser.Parser() stanford_tagger = \ POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger','../stanford-postagger-full-2015-01-30/stanford-postagger.jar') w = open('sentence_test', 'wb') text_token = self.tf.stanford_tokenize(sentence) text_pos = stanford_tagger.tag(text_token) print text_pos text_dependency = stanford_parser.parseToStanfordDependencies(sentence) temp_list = ['none'] * len(text_token) for dep in text_dependency: if dep[0] == 'amod': temp_list[int(dep[1])] = '%s_1' % dep[0] temp_list[int(dep[2])] = '%s_2' % dep[0] #end for for num, item in enumerate(text_pos[0]): temp_str = 'order' if opinion_dict.has_key(item[0]): temp_str = 'opion' featrue_list=[item[0],item[1],stemmer.stem(item[0]),item[0].lower(),\ temp_str,temp_list[num],'O'] w.write(' '.join(featrue_list) + '\n') pass
#!/usr/bin/env python # -*- coding: utf-8 -* import numpy import nltk from nltk.tag.stanford import POSTagger import sys if len(sys.argv) != 2: print 'must have one argument' sys.exit() chunk = sys.argv[1].decode('utf-8') #chunk = u"妈我" text = nltk.word_tokenize(chunk.encode('utf-8')) st = POSTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar') tsentence = st.tag(text) # print tsentence for w in tsentence: # print w # print w[1].decode('utf-8'), print w[1].split('#')[1]
def pos_stanford(tokens):
    """Return (token, POS) pairs for *tokens* via the Stanford tagger."""
    model = './english-bidirectional-distsim.tagger'
    jar = './stanford-postagger.jar'
    return POSTagger(model, jar).tag(tokens)
# Demo script: compare NLTK's built-in pos_tag with the Stanford tagger.
import os
from nltk import *
from nltk.tag.stanford import POSTagger
from nltk.stem.wordnet import WordNetLemmatizer

# Tagger resources live under lib\ next to the working directory (Windows
# path separators).
PATH_TO_TAGGER = os.path.join(os.getcwd(),
                              "lib\\wsj-0-18-bidirectional-nodistsim.tagger")
PATH_TO_JAR = os.path.join(os.getcwd(), "lib\\stanford-postagger.jar")

# pos_tag / word_tokenize come from the wildcard nltk import above.
print pos_tag("the sea touches me".split())
stanford_tagger = POSTagger(PATH_TO_TAGGER,PATH_TO_JAR)
print stanford_tagger.tag(word_tokenize("which ocean touches the state of California ?"))
# POS tagging
>>>import nltk
>>>from nltk import word_tokenize
>>>s="I was watching TV"
>>>print nltk.pos_tag(word_tokenize(s))

# all nouns
>>>tagged=nltk.pos_tag(word_tokenize(s))
>>>allnoun=[word for word,pos in tagged if pos in ['NN','NNP'] ]

# Stanford POS tagger
>>>from nltk.tag.stanford import POSTagger
>>>import nltk
>>>stan_tagger=POSTagger('models/english-bidirectional-distsim.tagger','stanford-postagger.jar')
>>>tokens =nltk.word_tokenize(s)
>>>stan_tagger.tag(tokens)

# POS tags freq distribution
>>>from nltk.corpus import brown
>>>import nltk
>>>tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
>>>print nltk.FreqDist(tags)

# default tagger
>>>brown_tagged_sents = brown.tagged_sents(categories='news')
>>>default_tagger = nltk.DefaultTagger('NN')
>>>print default_tagger.evaluate(brown_tagged_sents)

# N-gram taggers
from scipy.sparse import hstack
import os

__author__ = 'Jasneet Sabharwal'

# Paths to the bundled Stanford POS model/jar, SentiWordNet lexicon and
# bag-of-words vocabulary, resolved relative to this module's location.
_POS_TAGGER_MODEL_PATH = os.path.join(
    os.path.dirname(__file__), '..', '..',
    'lib/english-bidirectional-distsim.tagger')
_POS_TAGGER_JAR_PATH = os.path.join(os.path.dirname(__file__), '..', '..',
                                    'lib/stanford-postagger.jar')
_SENTI_WORDNET_FILE_PATH = os.path.join(os.path.dirname(__file__), '..', '..',
                                        'lib/SentiWordNet_3.0.0_20130122.txt')
_BOW_VOCAB_PATH = os.path.join(os.path.dirname(__file__), '..', '..',
                               'lib/bow_vocab')

# Heavyweight singletons constructed at import time: the JVM-backed tagger,
# the SentiWordNet reader and a binary unigram vectorizer with a fixed
# vocabulary and English stop words removed.
POS_TAGGER = POSTagger(_POS_TAGGER_MODEL_PATH, _POS_TAGGER_JAR_PATH)
SENTI_WORDNET = SentiWordNetCorpusReader(_SENTI_WORDNET_FILE_PATH)
BOW_VECTORIZER = CountVectorizer(
    min_df=1,
    binary=True,
    dtype='float64',
    lowercase=True,
    ngram_range=(1, 1),
    stop_words=stopwords.words('english'),
    vocabulary=utils.get_bow_vocab(_BOW_VOCAB_PATH))


def _pos_features(pos_tags):
    # Drop stop-word tokens before building POS-count features.
    # NOTE(review): this definition appears truncated at the chunk boundary;
    # code kept byte-identical.
    pos_tags = [(word, tag) for (word, tag) in pos_tags
                if not word.lower() in stopwords.words('english')]
    features = defaultdict(int)
from nltk.tag.stanford import POSTagger import textprocess as tp import os, time #Wraps the part of speech taggin functionality within this file try: pwd = os.path.dirname(os.path.realpath(__file__)) print pwd except: print 'Something screwed up, using os.getcwd() instead' pwd = os.getcwd() print "POSTagger Loaded" post = POSTagger(pwd+'/stanford-postagger/models/english-bidirectional-distsim.tagger', pwd+"/stanford-postagger/stanford-postagger.jar") def tag(text): text = tp.preprocess(text) #print text t1 = time.time() outlist = post.tag(text.split()) t2 = time.time() print "POS Tagging complete. Time taken: ", t2-t1, " seconds" return outlist
def evaluate(granularity, text):
    """Vectorize *text* (tf-idf + n-char + POS-frequency features), then run
    the pre-trained lower/upper SVM pair for *granularity* over it."""
    preprocessor = Preprocessor()
    entry = TextEntry()
    entry.body = text
    preprocessor.entries = [entry]
    data = preprocessor.get_clean_data()
    # Character n-grams seen at least 20 times in the cleaned text.
    ncharsAll = preprocessor.getNChars(items=data, freq=20)
    test_data_raw = preprocessor.get_clean_data()
    test_raw_text = preprocessor.get_raw_words()
    # Fitted vectorizers persisted at training time.
    count_vect = joblib.load('../models/t1/vec_count.joblib')
    tfidf_transform = joblib.load('../models/t1/tfidf_transform.joblib')
    data_counts = count_vect.transform(test_data_raw)
    test_data = tfidf_transform.transform(data_counts)
    dense_test = test_data.toarray()
    vocab = count_vect.vocabulary_
    # Keep only the n-chars that are NOT already in the count vocabulary.
    nchars = []
    for nchar in ncharsAll:
        if nchar not in vocab:
            nchars.append(nchar)
    numOfTags = len(tags)
    ncharVecSize = len(nchars)
    tag_vecs = []
    # `model`, `jar`, `tags`, `sizes` come from module scope (not visible
    # here) -- presumably tagger paths, the POS tag list and n-gram sizes.
    pos = POSTagger(model, jar, java_options='-mx2500m')
    # Per document: relative frequency of each POS tag.
    for i, text in enumerate(test_raw_text):
        if i % 10 == 0:
            print(i)
        words = text.split()
        tag_vector = np.zeros(numOfTags)
        words_with_tags = pos.tag(words)
        only_tags = [tag for word, tag in words_with_tags[0]]
        tags_with_freq = Counter(only_tags)
        for tag, freq in tags_with_freq.items():
            tag_vector[tags.index(tag)] = freq / len(words)
        tag_vecs.append(tag_vector)
    # Per document: relative frequency of each retained character n-gram.
    for i, text in enumerate(test_raw_text):
        if i % 100 == 0:
            print(i)
        words = text.split()
        ncharVec = np.zeros(ncharVecSize)
        for word in words:
            for size in sizes:
                text_nchars = [
                    word[i:i + size] for i in range(len(word) - size + 1)
                ]
                text_nchars_with_freq = Counter(text_nchars)
                for nchar, freq in text_nchars_with_freq.items():
                    if nchar in nchars:
                        ncharVec[nchars.index(nchar)] = freq / len(words)
        # NOTE(review): assigns a dense concatenated row into the sparse
        # tf-idf matrix -- row widths only match if the extra features fit;
        # confirm against the training-side feature layout.
        test_data[i] = np.concatenate((dense_test[i], ncharVec, tag_vecs[i]))
    svm_l = joblib.load('../models/t1/svm_l_' + granularity + '/svm_l_' +
                        granularity + '.joblib')
    # NOTE(review): svm_u loads the SAME 'svm_l_...' file as svm_l -- looks
    # like a copy-paste slip (expected 'svm_u_...'); verify before changing.
    svm_u = joblib.load('../models/t1/svm_l_' + granularity + '/svm_l_' +
                        granularity + '.joblib')
    evaluator = ClfEval(svm_l, svm_u)
    return evaluator.eval_data(csr_matrix(test_data))