Example #1
def vectorizer(tokens, w2v_db):
    db_path = w2v_db
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger', 'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)
    unsorted_kw = OrderedDict()
    for (w,t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label
    # Get the vectors of words. Maintain order as in document.
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        try:
            if token_vecs[word]: continue
        except KeyError:
            v = conn.read(word)
            if v is not None:
                token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs))) #Output for debugging; total vs unique words.
    conn.close()
    return unsorted_kw, token_vecs
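Note: these examples use the older nltk.tag.stanford.POSTagger class name. In NLTK 3.x the same class is exposed as StanfordPOSTagger, so an equivalent call with a recent NLTK would look roughly like the sketch below (same relative model/jar paths as Example #1; adjust to your installation).

# Minimal sketch, assuming NLTK >= 3.0 where POSTagger was renamed
# StanfordPOSTagger; the model/jar paths mirror Example #1 above.
from nltk.tag import StanfordPOSTagger

tagger = StanfordPOSTagger('tagger/english-left3words-distsim.tagger',
                           'tagger/stanford-postagger.jar')
print(tagger.tag('A quick test sentence .'.split()))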
Example #2
def stanford_corenlp_filter(sent):
    from nltk.tag.stanford import POSTagger
    posTagger = POSTagger(
        '/Users/gt/Downloads/'
        'stanford-postagger-2013-06-20/models/'
        'wsj-0-18-bidirectional-nodistsim.tagger',
        '/Users/gt/Downloads/stanford-postagger-2013-06-20'
        '/stanford-postagger-3.2.0.jar',
        encoding=encoding)

    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    b1 = b1.lower()
    tokens = word_tokenize(b1)
    pos_tags = posTagger.tag(tokens)
    filtered_sent = ' '
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # filtered_sent += stemmer.stem(pos_t[0]) + ' '
            filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

            #note: 1 concat stemmer(word) == stemmer(1 concat word)

    b2 = b2.lower()
    tokens = word_tokenize(b2)
    pos_tags = posTagger.tag(tokens)
    # note: do not reset filtered_sent here, so the '1'-prefixed tokens from b1
    # are kept alongside the '2'-prefixed tokens from b2 appended below
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # filtered_sent += stemmer.stem(pos_t[0]) + ' '
            filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

    return filtered_sent
Example #3
File: tmw.py Project: daschloer/tmw
def nltk_stanfordpos(inpath, outfolder):
    """POS-Tagging French text with Stanford POS-Tagger via NLTK."""
    print("\nLaunched nltk_stanfordpos.")

    import os
    import glob
    from nltk.tag.stanford import POSTagger

    for file in glob.glob(inpath):
        st = POSTagger(
            '/home/christof/Programs/stanfordpos/models/french.tagger',
            '/home/christof/Programs/stanfordpos/stanford-postagger.jar',
            encoding="utf8")
        with open(file, "r", encoding="utf-8") as infile:
            untagged = infile.read()
            tagged = st.tag(untagged.split())

            taggedstring = ""
            for item in tagged:
                item = "\t".join(item)
                taggedstring = taggedstring + str(item) + "\n"
            #print(taggedstring)

            basename = os.path.basename(file)
            cleanfilename = basename
            if not os.path.exists(outfolder):
                os.makedirs(outfolder)
            with open(os.path.join(outfolder, cleanfilename), "w") as output:
                output.write(taggedstring)
    print("Done.")
Example #4
 def __init__(self):
     self.st = POSTagger(
         os.path.normpath(
             os.path.dirname(os.path.realpath(__file__)) +
             '/stanford-pos/models/english-bidirectional-distsim.tagger'),
         os.path.normpath(
             os.path.dirname(os.path.realpath(__file__)) +
             '/stanford-pos/stanford-postagger.jar'))
Example #5
    def __init__(self, override=False):
        tagger_path = os.path.join(DIRS.user_data_dir, stanford_postagger_name)
        if not os.path.exists(tagger_path):
            raise LookupError("Stanford POS tagger not found. Try running the "
                              "command download_third_party_data.py")

        postagger = POSTagger(
            os.path.join(tagger_path, 'models', 'english-bidirectional-distsim.tagger'),
            os.path.join(tagger_path, 'stanford-postagger.jar'),
            encoding='utf8')
        super(StanfordTaggerRunner, self).__init__(postagger.batch_tag, override)
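Aside: batch_tag is the pre-NLTK-3 name for tagging several pre-tokenized sentences in one JVM call; newer releases expose the same behaviour as tag_sents. A minimal sketch under that assumption (placeholder paths, not taken from this example):

# Sketch only, assuming NLTK >= 3.0; model/jar paths are placeholders.
from nltk.tag import StanfordPOSTagger

tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger',
                           'stanford-postagger.jar', encoding='utf8')
sentences = [['This', 'is', 'one', 'sentence', '.'],
             ['Here', 'is', 'another', '.']]
print(tagger.tag_sents(sentences))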
Example #6
def processor(name, url, tokens, db_path, json_dir, USE_TITLE_WORDS=False):
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label

    # Get the vectors list
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        try:
            if token_vecs[word]: continue
        except KeyError:
            v = conn.read(word)
            if v is not None:
                token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()

    #Compute cluster centers:
    nk = round(len(token_vecs) / 4)
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data, nk, iter=20, minit='points')
    centroids = cent.tolist()

    # Create the JSON object for this webpage.

    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir, name + '.json')
    file_dest = open(json_path, 'w')
    json.dump(
        {
            'url': url,
            'vectors': token_vecs,
            'keyword_frequency': unsorted_kw,
            'centroids': centroids
        }, file_dest)
    file_dest.close()
Example #7
def tag(segments):
    #st = POSTagger('/home/dc65/Documents/tools/stanford-postagger-2014-01-04/models/english-left3words-distsim.tagger', '/home/dc65/Documents/tools/stanford-postagger-2014-01-04/stanford-postagger-3.3.1.jar')
    st = POSTagger(
        os.path.join(stanford_path,
                     'models/english-left3words-distsim.tagger'),
        os.path.join(stanford_path, 'stanford-postagger-3.3.1.jar'))
    tagged = []
    for segment in segments:
        x = ' '.join(
            nltk.tag.tuple2str(w) for w in st.tag(word_tokenize(segment)))
        tagged.append(x.decode('utf-8'))
    return tagged
Example #8
def pos_tag(to_tag,
            model_path=root_path +
            "\\stanford-postagger-full-2013-06-20\\models\\french.tagger",
            jar_path=root_path +
            "\\stanford-postagger-full-2013-06-20\\stanford-postagger.jar"):
    '''Tag the tokens with part of speech; to_tag is the tokenized text to tag, model_path is the file path to the Stanford POS tagger model, and jar_path is the path to the Stanford POS tagger jar file.'''
    pos_tagger = POSTagger(
        model_path, jar_path, encoding='utf8'
    )  #create an object of class POSTagger that is encoded in UTF-8
    tags = pos_tagger.tag(
        to_tag)  #run the tagging algorithm on the tokenized raw text
    return tags
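The default paths above are built by concatenating Windows-style backslashes onto root_path; a portable variant could build them with os.path.join instead. A sketch, assuming the same root_path and folder layout as the example:

# Portable sketch only; root_path is assumed to be defined at module level,
# as in the example above.
import os
from nltk.tag.stanford import POSTagger

model_path = os.path.join(root_path, 'stanford-postagger-full-2013-06-20',
                          'models', 'french.tagger')
jar_path = os.path.join(root_path, 'stanford-postagger-full-2013-06-20',
                        'stanford-postagger.jar')
pos_tagger = POSTagger(model_path, jar_path, encoding='utf8')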
Example #9
        def __init__(self, pathToParser=None, javaHeapOptions='-Xmx4g -XX:+UseParallelGC -XX:-UseGCOverheadLimit'):

                if pathToParser is None:
                        taggerLibraryPath = normpath(os.path.join(os.getcwd(), "sp/jar/stanford-postagger.jar"))
                        taggerModelPath = normpath(os.path.join(os.getcwd(), "sp/models/english-bidirectional-distsim.tagger"))
                else:
                        taggerLibraryPath = normpath(os.path.join(pathToParser, "sp/jar/stanford-postagger.jar"))
                        taggerModelPath = normpath(os.path.join(pathToParser, "sp/models/english-bidirectional-distsim.tagger"))

                self.stanfordTagger = POSTagger(taggerModelPath,
                        taggerLibraryPath, java_options=javaHeapOptions)

Example #10
def pos_tag_stanford(toked_sentence):
    """
	INPUT: list of strings
	OUTPUT: list of tuples

	Given a tokenized sentence, return 
	a list of tuples of form (token, POS)
	where POS is the part of speech of token
	"""

    from nltk.tag.stanford import POSTagger
    st = POSTagger('english-bidirectional-distsim.tagger',
                   'stanford-postagger.jar')

    return st.tag(toked_sentence)
Example #11
    def __init__(self, posTagModelPath, posTaggerPath, parserModelPath,
                 workingDir):

        try:
            self.posTagger = POSTagger(posTagModelPath, posTaggerPath, "UTF-8")
            print "pos tagger is loaded"
        except:
            print "Error in loading POS tagger"

        try:
            self.parser = MaltParser(tagger=None,
                                     mco=parserModelPath,
                                     working_dir=workingDir)
            print "parser is loaded"
        except:
            print "Error in loading the MALT Parser"
Example #12
def pos_tag(sent, tagger='stanford'):
    
    # cache pos_tagger as a global variable,
    # so that it is not recreated every time pos_tag is executed
    if 'pos_tagger' not in globals():
        global pos_tagger
        pos_tagger = POSTagger(conf.stanford_pos_model, path_to_jar=conf.stanford_postagger, encoding='UTF-8')

    if tagger == 'nltk' :
        tokens = tokenize(sent, 's')
        return nltk.pos_tag(tokens)
    elif tagger == 'stanford' :
        tokens = tokenize(sent,'w')
        return pos_tagger.tag(tokens)
    else :
        raise ValueError('No such tagger: ' + tagger)
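An alternative to checking globals() is to memoize the tagger construction. A sketch only (assumes Python 3 for functools.lru_cache, and the same conf.stanford_pos_model / conf.stanford_postagger settings as above):

# Alternative sketch: build the Stanford tagger once and reuse it,
# without a module-level global. Not part of the original project.
from functools import lru_cache
from nltk.tag.stanford import POSTagger

@lru_cache(maxsize=1)
def get_pos_tagger():
    # conf.* values are assumed to exist, as in the example above.
    return POSTagger(conf.stanford_pos_model,
                     path_to_jar=conf.stanford_postagger,
                     encoding='UTF-8')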
Example #13
 def add_POS(self, row_file, target):
     '''
     row_str = '';
     f = open(row_file,'rb');
     for row in f:
         row_str+=row;
     soup = BeautifulSoup(row_str);
     self.soup = soup;
     sentences = soup.find_all('sentence');
     all_token = list();
     for block in sentences:
         text = block.text.strip();
         text_token = self.tf.stanford_tokenize(text);
         all_token.append(text_token);
     '''
     all_token = self.get_token(target)
     stanford_tagger = \
     POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger','../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
     tag_list = list()
     for row in all_token:
         temp_list = list()
         for word in row:
             if len(word) > 1 and re.match(r'^[A-Z]+', word):
                 temp_list.append(word.lower())
             else:
                 temp_list.append(word)
         tag_list.append(temp_list)
     #end for
     tagged_result = stanford_tagger.tag_sents(tag_list)
     '''
     for row in tagged_result:
         index_list = list();
         for num,item in enumerate(row):
             if not re.match(r'.*[\w\d]+',item[0]):
                 index_list.append(num);
         for i in index_list:
             row[i]=(row[i][0],row[i][0]);
     #end for
     '''
     w = open('pos_%s' % target, 'wb')
     for num1, row in enumerate(tagged_result):
         for num2, item in enumerate(row):
             w.write(all_token[num1][num2] + ' ' + item[1] + '\n')
         w.write('\n')
     #print tagged_result;
     return
Example #14
def pos_tag_stanford(toked_sentence):
    """
	INPUT: list of strings
	OUTPUT: list of tuples

	Given a tokenized sentence, return 
	a list of tuples of form (token, POS)
	where POS is the part of speech of token
	"""

    from nltk.tag.stanford import POSTagger
    st = POSTagger(
        '/home/satyam/zip/opinionproject/opinion_mining/resources/english-bidirectional-distsim.tagger',
        '/home/satyam/zip/opinionproject/opinion_mining/resources/stanford-postagger.jar'
    )

    return st.tag(toked_sentence)
Example #15
def pos_tag_stanford(toked_sentence):
    """
	INPUT: list of strings
	OUTPUT: list of tuples

	Given a tokenized sentence, return 
	a list of tuples of form (token, POS)
	where POS is the part of speech of token
	"""

    from nltk.tag.stanford import POSTagger
    st = POSTagger(
        '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/models/english-bidirectional-distsim.tagger',
        '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/stanford-postagger.jar'
    )

    return st.tag(toked_sentence)
Example #16
def main():
    data_file = open("../data/good_data.txt", "r")
    out_file = open("../data/good_lines_tags_1.txt", "w")
    lines = data_file.readlines()
    data_file.close()
    line_count = 0
    english_postagger = POSTagger(
        '../postagger/models/english-bidirectional-distsim.tagger',
        '../postagger/stanford-postagger.jar')
    for line in lines:
        tag_list = []
        for t in english_postagger.tag(line.split('\n')[0].split(' ')):
            tag_list.append(t[1])
        out_file.write(" ".join(tag_list))
        out_file.write("\n")
        print "completed line " + str(line_count)
        line_count += 1
    out_file.close()
Example #17
def main(word_transformation=None, result_path=None, save=SAVE, n=500):
    tagger = POSTagger(
        '/cs/fs/home/hxiao/code/CoreNLP/classes/edu/stanford/nlp/models/pos-tagger/english-left3words/english-bidirectional-distsim.tagger',
        '/cs/fs/home/hxiao/code/CoreNLP/javanlp-core.jar')

    tagged_corpus = nltk.corpus.treebank.tagged_sents()[-n:]

    print "extracting sentence words"
    if word_transformation and callable(word_transformation):
        tagged_corpus = [[(word_transformation(w), t) for w, t in sent]
                         for sent in tagged_corpus]

    print "extracting sents/tags"
    sents = ([w for w, t in sent] for sent in tagged_corpus)

    correct_tags = [[t for w, t in sent] for sent in tagged_corpus]

    print "predicting"
    predicted_tags = []
    really_correct_tags = []  # some sentence might be dropped
    sentences = []
    for i, (ctags, sent) in enumerate(zip(correct_tags, sents)):
        if (i + 1) % 5 == 0:
            print "%d finished" % (i + 1)
        try:
            ptags = [t for w, t in tagger.tag(sent)]
            if len(ctags) == len(ptags):
                predicted_tags.append(ptags)
                really_correct_tags.append(ctags)
                sentences.append(sent)
            else:
                print "tags length does not match for %r" % (sent)
        except UnicodeDecodeError:
            print "UnicodeDecodeError for ", sent
        except Exception:
            traceback.print_exc()

    if save:
        print "dumping to '%s'" % (result_path)
        dump((really_correct_tags, predicted_tags, sentences),
             open(result_path, "w"))
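Once the dumped results are reloaded, token-level accuracy follows directly from the parallel tag sequences. A minimal sketch (the helper name is hypothetical, not part of the example):

# Hypothetical helper, assuming `gold` and `pred` are parallel lists of tag
# sequences like really_correct_tags and predicted_tags above.
def tag_accuracy(gold, pred):
    pairs = [(g, p) for gs, ps in zip(gold, pred) for g, p in zip(gs, ps)]
    return sum(g == p for g, p in pairs) / float(len(pairs))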
Example #18
def pos_tag_stanford(toked_sentence):
    """
	INPUT: list of strings
	OUTPUT: list of tuples

	Given a tokenized sentence, return
	a list of tuples of form (token, POS)
	where POS is the part of speech of token
	"""

    from nltk.tag.stanford import POSTagger
    import os

    basePath = os.getcwd()
    st = POSTagger(
        basePath +
        '/resources/stanford-postagger-2015-12-09/models/english-bidirectional-distsim.tagger',
        basePath +
        '/resources/stanford-postagger-2015-12-09/stanford-postagger.jar')

    return st.tag(toked_sentence)
Example #19
    def pos_data(self, method='stanford'):
        '''
        POS-tag the data with one of two methods: 'stanford' uses the
        Stanford POS tagger, while 'nltk' uses the POS tagger built into NLTK.
        '''
        print 'Tagging the corpus....'
        my_tag = int
        if method == 'stanford':
            st=POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger'\
                        ,'../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
            my_tag = st.tag_sents
            #get tagged train_data
            sentences = list()
            for sentence in self.train_data:
                sentences.append(self.tk.word_tokenize(sentence))
            self.tagged_train_data = my_tag(sentences)
            #get tagged test_data
            sentences = list()
            for sentence in self.test_data:
                sentences.append(self.tk.word_tokenize(sentence))
            self.tagged_test_data = my_tag(sentences)
        elif method == 'nltk':
            my_tag = nltk.pos_tag
            #get tagged train_data
            tagged_train_data = list()
            for row in self.train_data:
                tagged_train_data.append(my_tag(row.split()))
            #get tagged test_data
            tagged_test_data = list()
            for row in self.test_data:
                tagged_test_data.append(my_tag(row.split()))

            self.tagged_train_data = tagged_train_data
            self.tagged_test_data = tagged_test_data
        pickle.dump(self.tagged_train_data, open('__tagged_train_data', 'wb'))
        pickle.dump(self.tagged_test_data, open('__tagged_test_data', 'wb'))
        #self.tagged_train_data=pickle.load(open('__tagged_train_data','rb'));
        #self.tagged_test_data=pickle.load(open('__tagged_test_data','rb'));
        print 'Done!'
        return
Example #20
	def __init__(self, posTagModelPath, posTaggerPath, parserModelPath, workingDir):
		
		
		
		try:
			self.logger = logging.getLogger(__name__)
			self.posTagger = POSTagger(posTagModelPath, posTaggerPath,encoding="UTF-8", java_options='-Xmx16000m')
			#self.posTagger = POSTagger(posTagModelPath, posTaggerPath,"UTF-8")
			#print "pos tagger is loaded"
		except:
			self.logger.warning("Error in loading POS tagger!")
			e = sys.exc_info()[0]
			self.logger.warning("Error:" + str(e))
					
		
		try:
			self.parser = MaltParser(tagger=None, mco = parserModelPath, working_dir= workingDir, additional_java_args=['-Xmx16000m']) 
			#print "parser is loaded"
		except:
			self.logger.warning("Error in loading the MALT Parser")
			e = sys.exc_info()[0]
			self.logger.warning("Error:" + str(e))				
Example #21
    def __init__(self, name, is_lazy, lazy_directory, debug, encoding,
                 tag_separator, stanford_jar_path, language_model_path):
        """
    Constructor of the component.

    @param  name:                 The name of the component.
    @type   name:                 C{string}
    @param  is_lazy:              True if the component must load previous data,
                                  False if data must be computed even though
                                  it has already been computed.
    @type   is_lazy:              C{bool}
    @param  lazy_directory:       The directory used to store previously
                                  computed data.
    @type   lazy_directory:       C{string}
    @param  debug:                True if the component is in debug mode, else
                                  False. When the component is in debug mode, it
                                  will output each step of its processing.
    @type   debug:                C{bool}
    @param  encoding:             The encoding of the files to pre-process.
    @type   encoding:             C{string}
    @param  tag_separator:        The symbol to use as a separator between a
                                  word and its POS tag.
    @type   tag_separator:        C{string}
    @param  stanford_jar_path:    The path to the jar of the Java Stanford
                                  Tagger.
    @type   stanford_jar_path:    C{string}
    @param  language_model_path:  The path to the language-specific Stanford
                                  model.
    @type   language_model_path:  C{string}
    """

        super(StanfordPreProcessor,
              self).__init__(name, is_lazy, lazy_directory, debug, encoding,
                             tag_separator)

        self.set_sentence_tokenizer(PunktSentenceTokenizer())
        self.set_pos_tagger(
            POSTagger(language_model_path, stanford_jar_path, encoding))
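For illustration only, a hypothetical instantiation matching the constructor parameters documented above (all values are placeholders; StanfordPreProcessor and its base class come from the original project):

# Placeholder values only; paths must point at a real Stanford tagger install.
preprocessor = StanfordPreProcessor(
    name='stanford_preprocessing',
    is_lazy=False,
    lazy_directory='/tmp/lazy',
    debug=False,
    encoding='utf-8',
    tag_separator='/',
    stanford_jar_path='stanford-postagger.jar',
    language_model_path='models/english-bidirectional-distsim.tagger')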
Example #22
 def generate_pos_set(self):
     print 'Building the positive-set dictionary....'
     pos_dict = dict()
     pos_set = set()
     sentences = list()
     for row in self.train_label:
         for key in row:
             if ' ' in key:
                 sentences.append(self.tk.word_tokenize(key))
             else:
                 pos_dict[key] = pos_dict.setdefault(key, 0) + 1
                 #pos_set.add(key);
     #end for
     st=POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger'\
                     ,'../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
     result = st.tag_sents(sentences)
     for row in result:
         for item in row:
             if item[1].startswith('NN'):
                 pos_dict[item[0]] = pos_dict.setdefault(item[0], 0) + 1
                 #pos_set.add(item[0]);
     #end for
     neg_dict = dict()
     for num, row in enumerate(self.tagged_train_data):
         for item in row:
             if item[1].startswith(
                     'NN') and item[0] not in self.train_word_label[num]:
                 neg_dict[item[0]] = neg_dict.setdefault(item[0], 0) + 1
     for key in pos_dict.keys():
         if pos_dict[key] > 1:
             if neg_dict.has_key(key):
                 if neg_dict[key] / pos_dict[key] < 2:
                     pos_set.add(key)
             else:
                 pos_set.add(key)
     self.pos_set = pos_set
     print 'Done!'
     return
Example #23
 def get_whole(self, sentence):
     opinion_dict = dict()
     pos_f = open('../opinion-lexicon-English/positive-words.txt', 'rb')
     neg_f = open('../opinion-lexicon-English/negative-words.txt', 'rb')
     for _ in xrange(35):
         pos_f.readline()
         neg_f.readline()
     for word in pos_f:
         opinion_dict[word.strip()] = True
     for word in neg_f:
         opinion_dict[word.strip()] = False
     pos_f.close()
     neg_f.close()
     stemmer = PorterStemmer()
     stanford_parser = parser.Parser()
     stanford_tagger = \
     POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger','../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
     w = open('sentence_test', 'wb')
     text_token = self.tf.stanford_tokenize(sentence)
     text_pos = stanford_tagger.tag(text_token)
     print text_pos
     text_dependency = stanford_parser.parseToStanfordDependencies(sentence)
     temp_list = ['none'] * len(text_token)
     for dep in text_dependency:
         if dep[0] == 'amod':
             temp_list[int(dep[1])] = '%s_1' % dep[0]
             temp_list[int(dep[2])] = '%s_2' % dep[0]
     #end for
     for num, item in enumerate(text_pos[0]):
         temp_str = 'order'
         if opinion_dict.has_key(item[0]):
             temp_str = 'opinion'
         feature_list=[item[0],item[1],stemmer.stem(item[0]),item[0].lower(),\
                       temp_str,temp_list[num],'O']
         w.write(' '.join(feature_list) + '\n')
     pass
Example #24
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy
import nltk
from nltk.tag.stanford import POSTagger
import sys

if len(sys.argv) != 2:
    print 'must have one argument'
    sys.exit()

chunk = sys.argv[1].decode('utf-8')
#chunk = u"妈我"

text = nltk.word_tokenize(chunk.encode('utf-8'))
st = POSTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar')

tsentence = st.tag(text)
# print tsentence
for w in tsentence:
    # print w
    # print w[1].decode('utf-8'),
    print w[1].split('#')[1]
Example #25
def pos_stanford(tokens):

    tagger = POSTagger('./english-bidirectional-distsim.tagger',
                       './stanford-postagger.jar')
    return tagger.tag(tokens)
Example #26
import os
from nltk import *
from nltk.tag.stanford import POSTagger
from nltk.stem.wordnet import WordNetLemmatizer
PATH_TO_TAGGER = os.path.join(os.getcwd(), "lib\\wsj-0-18-bidirectional-nodistsim.tagger")
PATH_TO_JAR = os.path.join(os.getcwd(), "lib\\stanford-postagger.jar")
print pos_tag("the sea touches me".split())
stanford_tagger = POSTagger(PATH_TO_TAGGER,PATH_TO_JAR)
print stanford_tagger.tag(word_tokenize("which ocean touches the state of California ?"))
Example #27
# POS tagging
>>>import nltk
>>>from nltk import word_tokenize
>>>s="I was watching TV"
>>>print nltk.pos_tag(word_tokenize(s))

# all nouns

>>>tagged=nltk.pos_tag(word_tokenize(s))
>>>allnoun=[word for word,pos in tagged if pos in ['NN','NNP'] ]

# Stanford POS tagger 

>>>from nltk.tag.stanford import POSTagger
>>>import nltk
>>>stan_tagger=POSTagger('models/english-bidirectional-distsim.tagger','stanford-postagger.jar')
>>>tokens =nltk.word_tokenize(s)
>>>stan_tagger.tag(tokens)

# POS tag frequency distribution
>>>from nltk.corpus import brown
>>>import nltk
>>>tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
>>>print nltk.FreqDist(tags)

# default tagger
>>>brown_tagged_sents = brown.tagged_sents(categories='news')
>>>default_tagger = nltk.DefaultTagger('NN')
>>>print default_tagger.evaluate(brown_tagged_sents)

# N-gram taggers
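The excerpt stops at the N-gram taggers heading; a hedged continuation in the same style, using NLTK's standard backoff taggers (not necessarily the original snippet's code), might be:

>>>size = int(len(brown_tagged_sents) * 0.9)
>>>train_sents, test_sents = brown_tagged_sents[:size], brown_tagged_sents[size:]
>>>unigram_tagger = nltk.UnigramTagger(train_sents, backoff=default_tagger)
>>>bigram_tagger = nltk.BigramTagger(train_sents, backoff=unigram_tagger)
>>>print bigram_tagger.evaluate(test_sents)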
Example #28
from collections import defaultdict

from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tag.stanford import POSTagger
import os

# SentiWordNetCorpusReader and utils are project-local modules in the original
# source; their imports are not shown here.

__author__ = 'Jasneet Sabharwal'

_POS_TAGGER_MODEL_PATH = os.path.join(
    os.path.dirname(__file__), '..', '..',
    'lib/english-bidirectional-distsim.tagger')
_POS_TAGGER_JAR_PATH = os.path.join(os.path.dirname(__file__), '..', '..',
                                    'lib/stanford-postagger.jar')
_SENTI_WORDNET_FILE_PATH = os.path.join(os.path.dirname(__file__), '..', '..',
                                        'lib/SentiWordNet_3.0.0_20130122.txt')
_BOW_VOCAB_PATH = os.path.join(os.path.dirname(__file__), '..', '..',
                               'lib/bow_vocab')

POS_TAGGER = POSTagger(_POS_TAGGER_MODEL_PATH, _POS_TAGGER_JAR_PATH)
SENTI_WORDNET = SentiWordNetCorpusReader(_SENTI_WORDNET_FILE_PATH)
BOW_VECTORIZER = CountVectorizer(
    min_df=1,
    binary=True,
    dtype='float64',
    lowercase=True,
    ngram_range=(1, 1),
    stop_words=stopwords.words('english'),
    vocabulary=utils.get_bow_vocab(_BOW_VOCAB_PATH))


def _pos_features(pos_tags):
    pos_tags = [(word, tag) for (word, tag) in pos_tags
                if not word.lower() in stopwords.words('english')]
    features = defaultdict(int)
Example #29
from nltk.tag.stanford import POSTagger
import textprocess as tp
import os, time

# Wraps the part-of-speech tagging functionality within this file

try:
    pwd = os.path.dirname(os.path.realpath(__file__))
    print pwd
except:
    print 'Something screwed up, using os.getcwd() instead'
    pwd = os.getcwd()
    
print "POSTagger Loaded"
post = POSTagger(pwd+'/stanford-postagger/models/english-bidirectional-distsim.tagger',
                 pwd+"/stanford-postagger/stanford-postagger.jar")

def tag(text):
    text = tp.preprocess(text)
    #print text
    t1 = time.time()
    outlist = post.tag(text.split())
    t2 = time.time()
    print "POS Tagging complete. Time taken: ", t2-t1, " seconds"
    return outlist
Example #30
def evaluate(granularity, text):

    preprocessor = Preprocessor()
    entry = TextEntry()
    entry.body = text
    preprocessor.entries = [entry]

    data = preprocessor.get_clean_data()
    ncharsAll = preprocessor.getNChars(items=data, freq=20)

    test_data_raw = preprocessor.get_clean_data()
    test_raw_text = preprocessor.get_raw_words()

    count_vect = joblib.load('../models/t1/vec_count.joblib')
    tfidf_transform = joblib.load('../models/t1/tfidf_transform.joblib')

    data_counts = count_vect.transform(test_data_raw)
    test_data = tfidf_transform.transform(data_counts)

    dense_test = test_data.toarray()

    vocab = count_vect.vocabulary_
    nchars = []
    for nchar in ncharsAll:
        if nchar not in vocab:
            nchars.append(nchar)

    numOfTags = len(tags)
    ncharVecSize = len(nchars)

    tag_vecs = []
    pos = POSTagger(model, jar, java_options='-mx2500m')
    for i, text in enumerate(test_raw_text):
        if i % 10 == 0:
            print(i)
        words = text.split()
        tag_vector = np.zeros(numOfTags)
        words_with_tags = pos.tag(words)
        only_tags = [tag for word, tag in words_with_tags[0]]
        tags_with_freq = Counter(only_tags)
        for tag, freq in tags_with_freq.items():
            tag_vector[tags.index(tag)] = freq / len(words)
        tag_vecs.append(tag_vector)

    for i, text in enumerate(test_raw_text):
        if i % 100 == 0:
            print(i)
        words = text.split()
        ncharVec = np.zeros(ncharVecSize)
        for word in words:
            for size in sizes:
                text_nchars = [
                    word[i:i + size] for i in range(len(word) - size + 1)
                ]
                text_nchars_with_freq = Counter(text_nchars)
                for nchar, freq in text_nchars_with_freq.items():
                    if nchar in nchars:
                        ncharVec[nchars.index(nchar)] = freq / len(words)

        test_data[i] = np.concatenate((dense_test[i], ncharVec, tag_vecs[i]))

    svm_l = joblib.load('../models/t1/svm_l_' + granularity + '/svm_l_' +
                        granularity + '.joblib')
    svm_u = joblib.load('../models/t1/svm_l_' + granularity + '/svm_l_' +
                        granularity + '.joblib')

    evaluator = ClfEval(svm_l, svm_u)
    return evaluator.eval_data(csr_matrix(test_data))