Example #1
def stanford_corenlp_filter(sent):
  from nltk.tag.stanford import POSTagger
  posTagger = POSTagger('/Users/gt/Downloads/'
                        'stanford-postagger-2013-06-20/models/'
                        'wsj-0-18-bidirectional-nodistsim.tagger',
                        '/Users/gt/Downloads/stanford-postagger-2013-06-20'
                        '/stanford-postagger-3.2.0.jar',encoding=encoding)

  b1, b2 = sent.split(blockSeparator)
  b2 = b2.rstrip()

  b1 = b1.lower()
  tokens = word_tokenize(b1)
  pos_tags = posTagger.tag(tokens)
  filtered_sent = ' '
  for pos_t in pos_tags:
    if pos_t[1] in filterList:
      # filtered_sent += stemmer.stem(pos_t[0]) + ' '
      filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

      #note: 1 concat stemmer(word) == stemmer(1 concat word)

  b2 = b2.lower()
  tokens = word_tokenize(b2)
  pos_tags = posTagger.tag(tokens)
  # do not reset filtered_sent here, so the '1'-prefixed block-1 tokens are kept
  for pos_t in pos_tags:
    if pos_t[1] in filterList:
      # filtered_sent += stemmer.stem(pos_t[0]) + ' '
      filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

  return filtered_sent
Example #2
def stanford_corenlp_filter(sent):
    from nltk.tag.stanford import POSTagger
    posTagger = POSTagger(
        '/Users/gt/Downloads/'
        'stanford-postagger-2013-06-20/models/'
        'wsj-0-18-bidirectional-nodistsim.tagger',
        '/Users/gt/Downloads/stanford-postagger-2013-06-20'
        '/stanford-postagger-3.2.0.jar',
        encoding=encoding)

    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    b1 = b1.lower()
    tokens = word_tokenize(b1)
    pos_tags = posTagger.tag(tokens)
    filtered_sent = ' '
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # filtered_sent += stemmer.stem(pos_t[0]) + ' '
            filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

            #note: 1 concat stemmer(word) == stemmer(1 concat word)

    b2 = b2.lower()
    tokens = word_tokenize(b2)
    pos_tags = posTagger.tag(tokens)
    # do not reset filtered_sent here, so the '1'-prefixed block-1 tokens are kept
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # filtered_sent += stemmer.stem(pos_t[0]) + ' '
            filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

    return filtered_sent
Example #3
File: tmw.py Project: daschloer/tmw
def nltk_stanfordpos(inpath, outfolder):
    """POS-Tagging French text with Stanford POS-Tagger via NLTK."""
    print("\nLaunched nltk_stanfordpos.")

    import os
    import glob
    from nltk.tag.stanford import POSTagger

    for file in glob.glob(inpath):
        st = POSTagger('/home/christof/Programs/stanfordpos/models/french.tagger', '/home/christof/Programs/stanfordpos/stanford-postagger.jar', encoding="utf8")
        with open(file, "r", encoding="utf-8") as infile:
            untagged = infile.read()
            tagged = st.tag(untagged.split())

            taggedstring = ""
            for item in tagged:
                item = "\t".join(item)
                taggedstring = taggedstring + str(item) + "\n"
            #print(taggedstring)

            basename = os.path.basename(file)
            cleanfilename = basename
            if not os.path.exists(outfolder):
                os.makedirs(outfolder)
            with open(os.path.join(outfolder, cleanfilename),"w") as output:
                output.write(taggedstring)
    print("Done.")
Example #4
def vectorizer(tokens, w2v_db):
    db_path = w2v_db
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger', 'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)
    unsorted_kw = OrderedDict()
    for (w,t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
            
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label
    # Get the vectors of words. Maintain order as in document.
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        try:
            if token_vecs[word]: continue
        except KeyError:
            v = conn.read(word)
            if not v is None:
                token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs))) #Output for debugging; total vs unique words.
    conn.close()
    return unsorted_kw, token_vecs
Example #5
def cleanTokens(tokens):

    st = POSTagger('/models/german-fast.tagger')

    tags = st.tag(tokens)

    # keep proper nouns (NE) and common nouns (NN) longer than three characters
    def cleanTags(x):
        y = x[1]
        return bool(re.match("NE|NN", y) and len(x[0]) > 3)

    clean_tags = filter(cleanTags, tags)

    # import pdb; pdb.set_trace()

    # collect just the words from the (word, tag) tuples
    def buildSentens(arr):
        words = []
        for i in arr:
            words.append(i[0])
        return words

    # print len(clean_tags)
    # print clean_tags
    clean = buildSentens(clean_tags)

    return clean
Example #6
def main():

    st = POSTagger(
        "/home/shaun/stanford-postagger-full-2013-11-12/models/german-dewac.tagger",
        "/home/shaun/stanford-postagger-full-2013-11-12/stanford-postagger.jar",
    )

    # st = POSTagger("/home/shaun/stanford-postagger-full-2013-11-12/models/german-fast.tagger", \
    # "/home/shaun/stanford-postagger-full-2013-11-12/stanford-postagger.jar")

    # print st.tag("Die Kinder in Bayern haben lange Ferien".split())

    # return

    with open(sys.argv[1], "r") as f:
        content = f.read()

    sentences = re.split("\n|\.|\?", content)

    for s in sentences:
        if len(s) == 0:
            continue
        # print s
        pieces = st.tag(s.split())
        strippedPieces = stripPieces(pieces)

        print " ".join(strippedPieces)
Example #7
def postext_st(filename):
    # Opening of File
    path_to_raw = '/home/cyneo/Work/Scans/Text Version/'

    if type(filename) != str:
        raise IOError('Filename must be a string')

    # Preparing to Tokenize
    with open(osp.abspath(path_to_raw + filename + '.txt'),
              'r', encoding='utf8') as raw:
        # Initialize the punkt module
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = []

        for line in raw:
            sents.extend(sent_detector.tokenize(line.strip()))
    
    tokenedsents = []
    # Tokenizing
    from nltk.tokenize.stanford import StanfordTokenizer
    for line in sents:
        tokenedsents.append(StanfordTokenizer().tokenize(line))

    # Parts of Speech Tagging
    posSents = []
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/mnt/sda2/stanford-packages/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',
                   encoding='utf8')

    for line in tokenedsents:
        # Returns a list of a list of tuples
        posSents.append(st.tag(line))

    return posSents
Example #8
File: tmw.py Project: daschloer/tmw
def nltk_stanfordpos(inpath, outfolder):
    """POS-Tagging French text with Stanford POS-Tagger via NLTK."""
    print("\nLaunched nltk_stanfordpos.")

    import os
    import glob
    from nltk.tag.stanford import POSTagger

    for file in glob.glob(inpath):
        st = POSTagger(
            '/home/christof/Programs/stanfordpos/models/french.tagger',
            '/home/christof/Programs/stanfordpos/stanford-postagger.jar',
            encoding="utf8")
        with open(file, "r", encoding="utf-8") as infile:
            untagged = infile.read()
            tagged = st.tag(untagged.split())

            taggedstring = ""
            for item in tagged:
                item = "\t".join(item)
                taggedstring = taggedstring + str(item) + "\n"
            #print(taggedstring)

            basename = os.path.basename(file)
            cleanfilename = basename
            if not os.path.exists(outfolder):
                os.makedirs(outfolder)
            with open(os.path.join(outfolder, cleanfilename), "w") as output:
                output.write(taggedstring)
    print("Done.")
def createModel():
    global classifierit
    global classifierloose
    global classifieryou
    global classifierto
    global classifiertheir
    trainingitSet = []
    traininglooseSet = []
    trainingyouSet = []
    trainingtoSet = []
    trainingtheirSet= []
    st = POSTagger('/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/models/english-bidirectional-distsim.tagger', '/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/stanford-postagger.jar')
    for line in brown.sents():
        print line
        tagSent = st.tag(line)
        print tagSent
        arrayOfitFeature = pos_itfeatures(tagSent)
        arrayOfyouFeature = pos_youfeatures(tagSent)
        arrayOftheirFeature = pos_theirfeatures(tagSent)
        arrayOflooseFeature = pos_loosefeatures(tagSent)
        arrayOftoFeature = pos_tofeatures(tagSent)
        if arrayOfitFeature:
            trainingitSet.extend(arrayOfitFeature)
        if arrayOftheirFeature:
            trainingtheirSet.extend(arrayOftheirFeature)
        if arrayOflooseFeature:
            traininglooseSet.extend(arrayOflooseFeature)
        if arrayOftoFeature:
            trainingtoSet.extend(arrayOftoFeature)
        if arrayOfyouFeature:
            trainingyouSet.extend(arrayOfyouFeature)
        
    
    algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[1]
    #encodingit = maxent.TypedMaxentFeatureEncoding.train(trainingitSet, count_cutoff=3, alwayson_features=True)
    classifierit = maxent.MaxentClassifier.train(trainingitSet, algorithm)
    f = open('classifierit.pickle', 'wb')
    pickle.dump(classifierit, f)
    f.close()
    #encodingloose = maxent.TypedMaxentFeatureEncoding.train(traininglooseSet, count_cutoff=3, alwayson_features=True)
    classifierloose = maxent.MaxentClassifier.train(traininglooseSet, algorithm)
    f = open('classifierloose.pickle', 'wb')
    pickle.dump(classifierloose, f)
    f.close()
    #encodingyou = maxent.TypedMaxentFeatureEncoding.train(trainingyouSet, count_cutoff=3, alwayson_features=True)
    classifieryou = maxent.MaxentClassifier.train(trainingyouSet, algorithm)
    f = open('classifieryou.pickle', 'wb')
    pickle.dump(classifieryou, f)
    f.close()
    #encodingto = maxent.TypedMaxentFeatureEncoding.train(trainingtoSet, count_cutoff=3, alwayson_features=True)
    classifierto = maxent.MaxentClassifier.train(trainingtoSet, algorithm)
    f = open('classifierto.pickle', 'wb')
    pickle.dump(classifierto, f)
    f.close()
    #encodingtheir = maxent.TypedMaxentFeatureEncoding.train(trainingtheirSet, count_cutoff=3, alwayson_features=True)
    classifiertheir = maxent.MaxentClassifier.train(trainingtheirSet, algorithm)
    f = open('classifiertheir.pickle', 'wb')
    pickle.dump(classifiertheir, f)
    f.close()      
Example #10
def stanford_tag(sentence):
    ''' use stanford tagger to tag a single tokenized sentence
    '''
    import src.experiment.path as path
    tagger = POSTagger(path.stanford_tagger_model_path(),
                       path.stanford_tagger_path(),
                       java_options='-Xmx16g -XX:MaxPermSize=256m')
    return tagger.tag(sentence)
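# A minimal usage sketch, assuming src.experiment.path resolves to a working
# Stanford tagger model and jar; the tokenized sentence below is a made-up input:
tagged = stanford_tag(['This', 'is', 'a', 'tokenized', 'sentence', '.'])
# tagged is a list of (token, POS) tuples, e.g. starting [('This', 'DT'), ...]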
Example #11
def tag(segments):
    #st = POSTagger('/home/dc65/Documents/tools/stanford-postagger-2014-01-04/models/english-left3words-distsim.tagger', '/home/dc65/Documents/tools/stanford-postagger-2014-01-04/stanford-postagger-3.3.1.jar')
    st = POSTagger(os.path.join(stanford_path, 'models/english-left3words-distsim.tagger'),
                   os.path.join(stanford_path, 'stanford-postagger-3.3.1.jar'))
    tagged = []
    for segment in segments:
        x = ' '.join(nltk.tag.tuple2str(w) for w in st.tag(word_tokenize(segment)))
        tagged.append(x.decode('utf-8'))
    return tagged
Example #12
def spanish_pos(text):
	""" Parts of speech tagger for Spanish """
	
	text = text.encode('utf8')

	st = POSTagger('/Users/Lena/src/context/stanford-postagger/models/spanish-distsim.tagger', 
				'/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar', 'utf8')

	pos_tagged = st.tag(text.split())

	return pos_tagged  
Example #13
def german_pos(text):
	""" Parts of speech tagger for German """
	
	text = text.encode('utf8')

	st = POSTagger('/Users/Lena/src/context/stanford-postagger/models/german-fast.tagger', 
				'/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar', 'utf8')

	pos_tagged = st.tag(text.split())

	return pos_tagged  
Example #14
def processor(name, url, tokens, db_path, json_dir, USE_TITLE_WORDS=False):
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label

    # Get the vectors list
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        try:
            if token_vecs[word]: continue
        except KeyError:
            v = conn.read(word)
            if not v is None:
                token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()

    #Compute cluster centers:
    nk = round(len(token_vecs) / 4)
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data, nk, iter=20, minit='points')
    centroids = cent.tolist()

    # Create the JSON object for this webpage.

    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir, name + '.json')
    file_dest = open(json_path, 'w')
    json.dump(
        {
            'url': url,
            'vectors': token_vecs,
            'keyword_frequency': unsorted_kw,
            'centroids': centroids
        }, file_dest)
    file_dest.close()
Example #15
class Tagger():
    def __init__(self):
        self.st = POSTagger(
            os.path.normpath(
                os.path.dirname(os.path.realpath(__file__)) +
                '/stanford-pos/models/english-bidirectional-distsim.tagger'),
            os.path.normpath(
                os.path.dirname(os.path.realpath(__file__)) +
                '/stanford-pos/stanford-postagger.jar'))

    def tag(self, line):
        return self.st.tag(line.split())
Example #16
def pos_tag(to_tag,
            model_path=root_path +
            "\\stanford-postagger-full-2013-06-20\\models\\french.tagger",
            jar_path=root_path +
            "\\stanford-postagger-full-2013-06-20\\stanford-postagger.jar"):
    '''Tag the tokens with parts of speech; to_tag is the list of tokens to tag, model_path is the file path to the Stanford POS tagger model, and jar_path is the path to the Stanford POS tagger jar file.'''
    pos_tagger = POSTagger(
        model_path, jar_path, encoding='utf8'
    )  #create an object of class POSTagger that is encoded in UTF-8
    tags = pos_tagger.tag(
        to_tag)  #run the tagging algorithm on the tokenized raw text
    return tags
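# A minimal usage sketch, assuming root_path points at a directory containing the
# unpacked stanford-postagger-full-2013-06-20 release; the token list is made up:
tokens = ['Le', 'chat', 'dort', '.']
tags = pos_tag(tokens)
# tags is a list of (token, POS) tuples produced by the French model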
Example #17
def tag(segments):
    #st = POSTagger('/home/dc65/Documents/tools/stanford-postagger-2014-01-04/models/english-left3words-distsim.tagger', '/home/dc65/Documents/tools/stanford-postagger-2014-01-04/stanford-postagger-3.3.1.jar')
    st = POSTagger(
        os.path.join(stanford_path,
                     'models/english-left3words-distsim.tagger'),
        os.path.join(stanford_path, 'stanford-postagger-3.3.1.jar'))
    tagged = []
    for segment in segments:
        x = ' '.join(
            nltk.tag.tuple2str(w) for w in st.tag(word_tokenize(segment)))
        tagged.append(x.decode('utf-8'))
    return tagged
Example #18
def main():

    print "Inicio..."
    with open("tweets_a_procesar_v2.csv", 'rb') as csvfile:
        lines = csv.reader(csvfile, delimiter=DELIMITER, quotechar="'")
        # All the tweets are stored in this variable
        tweets = []
        for line in lines:
            tweet = Tweet(line)
            #print tweet.spanish_text.split()
            tweets.append(tweet)
        
    # output file
    output = open("output_tagged_v2.csv", 'wb')
    filewriter = csv.writer(output, delimiter=DELIMITER, quotechar="'")

    # importing the Stanford NLP Spanish tagger
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/models/spanish-distsim.tagger','/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/stanford-postagger-3.4.1.jar',encoding='utf-8')
    #st = POSTagger('/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/models/spanish.tagger','/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/stanford-postagger-3.4.1.jar',encoding='utf-8')
    #st = POSTagger('C:\Data\stanford-postagger-full-2014-08-27\models\spanish.tagger', 'C:\Data\stanford-postagger-full-2014-08-27\stanford-postagger-3.4.1.jar', encoding='utf-8')

    n=0
    for tweet in tweets:
        n+=1
        print tweet.spanish_text
        # Example: st.tag('What is the airspeed of an unladen swallow ?'.split())
        tweet_tagged = st.tag((tweet.spanish_text).split())
        # Example output: [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
        #print tweet_tagged

        important_words = []
        n_adj = 0
        for tag in tweet_tagged:
            inicial = tag[1][:1]
            if('a' in inicial):
                important_words.append(tag[0])
            if('r' in inicial):
                important_words.append(tag[0])
            if('n' in inicial):
                important_words.append(tag[0])
            if('v' in inicial):
                important_words.append(tag[0])

        #tweet.cant_adj = n_adj
        tweet.tweet_tagged = tweet_tagged
        tweet.important_words = important_words
        filewriter.writerow(tweet.to_CSV())
        if n % 100 == 0: print n
    print "Done"
    output.close()
Example #19
class yagoScores:
    def __init__(self):
        self.en_postagger = POSTagger('parser/models/english-bidirectional-distsim.tagger', 'parser/stanford-postagger.jar')
    
    def parse(self,text):
        return self.en_postagger.tag(text.split())
        
    def get_underscoreWords(self,text):
        return re.findall("[a-z]+_[a-z]+", text)
    
    def findNounsSeq(self,tuples):
        self.noun = []    
        self.nouns = []
        prev = ""
        for each in tuples:
            if(each[1]=="NN"):
                self.noun.append(each[0])
            if(each[1]=="NNS"):
                self.nouns.append(prev+" "+each[0])
                prev = prev+" "+each[0]
            else:
                prev = each[0]
    
    def searchInWiki(self,guessess):
        #text = " ".join(self.noun)+" ".join(self.nouns)  
        text = " ".join(self.nouns) 
        print text  
        links = wikipedia.search(text)
        print ("LINKS")
        print links    
        for link in links:
            page = wikipedia.page(link)
            print page.title
            # check if guess appears in that page
            for eachg in guessess:
                print eachg.replace("_", " ").lower()
                if(eachg.replace("_", " ").lower() in page.content.lower()):
                    print "founddddddddddddddddddddd"
                    self.freq[eachg] += 1
    
    # Call getScore(self,text,guessess)function from outside, returns dict of scores of wiki appearances
    def getScore(self,text,guessess):
        self.freq = defaultdict(int)
        tuples = self.parse(text)
        print tuples
        self.findNounsSeq(tuples)
        self.searchInWiki(guessess)
        print self.freq
        return self.freq
def pos_tag_stanford(toked_sentence):
	"""
	INPUT: list of strings
	OUTPUT: list of tuples

	Given a tokenized sentence, return 
	a list of tuples of form (token, POS)
	where POS is the part of speech of token
	"""

	from nltk.tag.stanford import POSTagger
	st = POSTagger('/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/models/english-bidirectional-distsim.tagger', 
               '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/stanford-postagger.jar')

	return st.tag(toked_sentence)
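# A minimal usage sketch, assuming the hard-coded model and jar paths above exist
# on this machine; the tokenized sentence below is a made-up input:
tagged = pos_tag_stanford(['The', 'battery', 'life', 'is', 'great', '.'])
# tagged is a list of (token, POS) tuples such as ('battery', 'NN')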
def pos_tag_stanford(toked_sentence):
    """
	INPUT: list of strings
	OUTPUT: list of tuples

	Given a tokenized sentence, return 
	a list of tuples of form (token, POS)
	where POS is the part of speech of token
	"""

    from nltk.tag.stanford import POSTagger
    st = POSTagger('english-bidirectional-distsim.tagger',
                   'stanford-postagger.jar')

    return st.tag(toked_sentence)
Example #22
class StanfordTagger(WorkflowNativePOSTagger):

    def __init__(self, xml):
        from nltk.tag.stanford import POSTagger
        import os
        super(StanfordTagger, self).__init__(xml)
        self.tagger = POSTagger(os.path.join(os.getcwd(),'External/english-bidirectional-distsim.tagger'), os.path.join(os.getcwd(),'External/stanford-postagger.jar'))

    def is_ascii(self, s):
        return all(ord(c) < 128 for c in s)

    def tokenize(self, document):
        # Non-ASCII characters make the Stanford tagger go crazy and run out of heap space
        if self.is_ascii(document):
            for word, tag in self.tagger.tag(document):
                yield "%s/%s" % (word, tag)
Example #23
def pos_tag(sent, tagger='stanford'):
    
    # cache pos_tagger as a global variable,
    # so that it is not recreated every time pos_tag is executed
    if 'pos_tagger' not in globals():
        global pos_tagger
        pos_tagger = POSTagger(conf.stanford_pos_model, path_to_jar=conf.stanford_postagger, encoding='UTF-8')

    if tagger == 'nltk' :
        tokens = tokenize(sent, 's')
        return nltk.pos_tag(tokens)
    elif tagger == 'stanford' :
        tokens = tokenize(sent,'w')
        return pos_tagger.tag(tokens)
    else :
        raise ValueError('No such tagger: ' + tagger)
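# A minimal usage sketch, assuming the surrounding module provides conf (with
# stanford_pos_model and stanford_postagger paths) and the tokenize() helper:
nltk_tags = pos_tag('The weather is nice today.', tagger='nltk')
stanford_tags = pos_tag('The weather is nice today.', tagger='stanford')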
Example #24
def processor(name, url, tokens, db_path,json_dir, USE_TITLE_WORDS = False):
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger', 'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    unsorted_kw = OrderedDict()
    for (w,t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label

    # Get the vectors list
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        try:
            if token_vecs[word]: continue
        except KeyError:
            v = conn.read(word)
            if not v is None:
                token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()

    #Compute cluster centers:
    nk = round(len(token_vecs)/4)
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data,nk,iter=20,minit='points')
    centroids = cent.tolist()

    # Create the JSON object for this webpage.

    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir,name+'.json')
    file_dest = open(json_path, 'w')
    json.dump({'url': url, 'vectors' : token_vecs, 'keyword_frequency': unsorted_kw, 'centroids' : centroids}, file_dest)
    file_dest.close()
def pos_tag_stanford(toked_sentence):
    """
	INPUT: list of strings
	OUTPUT: list of tuples

	Given a tokenized sentence, return 
	a list of tuples of form (token, POS)
	where POS is the part of speech of token
	"""

    from nltk.tag.stanford import POSTagger
    st = POSTagger(
        '/home/satyam/zip/opinionproject/opinion_mining/resources/english-bidirectional-distsim.tagger',
        '/home/satyam/zip/opinionproject/opinion_mining/resources/stanford-postagger.jar'
    )

    return st.tag(toked_sentence)
def pos_tag_stanford(toked_sentence):
    """
	INPUT: list of strings
	OUTPUT: list of tuples

	Given a tokenized sentence, return 
	a list of tuples of form (token, POS)
	where POS is the part of speech of token
	"""

    from nltk.tag.stanford import POSTagger
    st = POSTagger(
        '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/models/english-bidirectional-distsim.tagger',
        '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/stanford-postagger.jar'
    )

    return st.tag(toked_sentence)
def stan_pos(input_sent):
    """
    This function calls stanford POS tagger.In this function Stanford POS tagger directory must be in the same directory.And this function chooses model "wsj left 3 words" as normal POS tagging model. If  you want to use other POS tagging models, please change first argument of st = POSTagger() below.

    """
    eval_sent = []

    st = POSTagger("./stanford-postagger-2012-11-11/models/wsj-0-18-left3words.tagger","./stanford-postagger-2012-11-11/stanford-postagger.jar")

    pos_result = st.tag(input_sent.split())
    for one_tuple in pos_result:
        pos_format = one_tuple[0] + "_" + one_tuple[1]
        
        eval_sent.append(pos_format)

    eval_sent = reg_form(eval_sent)
    return eval_sent
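# A minimal usage sketch, assuming the stanford-postagger-2012-11-11 directory
# sits next to this script and reg_form() is defined elsewhere in the module:
result = stan_pos("This is a short test sentence")
# result holds the word_TAG tokens (e.g. "This_DT") after reg_form() is applied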
Example #28
def main():
    dict2 = readDict("dict2.txt")
    sentences2 = readSentences("sentences2.txt")
    translated2 = translate(sentences2, dict2)
    print "======================================BASE TRANSLATION=========================================="
    for sentence in translated2:
        print sentence

    print "================================================================================================"

    st = POSTagger('stanford-postagger/models/english-left3words-distsim.tagger',
        'stanford-postagger/stanford-postagger.jar')
    POS = []
    for sentence in translated2:
        tagged = st.tag(sentence.split())
        if (len(tagged)>0):
            POS.append(tagged)

    POS = stupidFixes(POS)
    print "==================================STUPID FIXES TRANSLATION======================================"
    for sentence in POS:
#        print sentence # '[%s]' % ', '.join(map(str, sentence))
        print ' '.join(map(getWord, sentence))


    POS = rulesOneThree(POS)
    print "=====================================RULE1+3 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))

    POS = rulesFourFiveSeven(POS)
    print "=====================================RULE4+5+7 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))

    POS = ruleTwoNine(POS)
    POS = ruleTwoNine(POS) # apply twice
    print "=====================================RULE2+9 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))

    POS = ruleSixEight(POS)
    print "=====================================RULE6+8 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))
Example #29
	def get_transactions(self, product_reviews):
		'''
			Generates a set of transactions ready for frequent itemset mining
			from the crawled product reviews
		'''
		pos_tagger = POSTagger(PATHS['POS_MODEL'], PATHS['POS_TAGGER'])

		pos_output = []
		transactions_output = []

		print 'Generating transactions...'
		product_count = 0
		sentence_count = 0
		for product in product_reviews:
			sentences = sent_tokenize(product)
			for sentence in sentences:
				try:
					sent_pos = pos_tagger.tag(word_tokenize(sentence))
				except UnicodeEncodeError:
					continue
				trans = []
				pos_tags = []
				for word, pos in sent_pos:
					pos_tags.append(':'.join([word, pos]))
					if ((pos == 'NN' or pos == 'NNS' or pos == 'NP') and
						re.match('^[A-Za-z0-9-]+$', word)):
						trans.append(word.lower())
				if trans:
					pos_output.append([sentence] + pos_tags)
					transactions_output.append([sentence] + trans)
					sentence_count += 1
			product_count += 1

			print '---%s Reviews and %s Transactions Parsed---' % (
				product_count,
				sentence_count
			)

		write_csv(PATHS['POS'], pos_output)
		write_csv(PATHS['TRANSACTIONS'], transactions_output)

		print 'Finished generating transactions...'
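# A minimal usage sketch, assuming a hypothetical ReviewMiner class exposes the
# get_transactions() method above and PATHS points at the tagger model/jar and
# the POS / TRANSACTIONS output CSV locations:
miner = ReviewMiner()
miner.get_transactions([
    u"The battery lasts long and the screen is sharp.",
    u"Shipping was slow but the camera quality is good.",
])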
def main(word_transformation = None, result_path = None, save = SAVE, n = 500):
    tagger = POSTagger('/cs/fs/home/hxiao/code/CoreNLP/classes/edu/stanford/nlp/models/pos-tagger/english-left3words/english-bidirectional-distsim.tagger',
                       '/cs/fs/home/hxiao/code/CoreNLP/javanlp-core.jar')
    
    tagged_corpus = nltk.corpus.treebank.tagged_sents()[-n:]
    
    print "extracting sentence words"
    if word_transformation and callable(word_transformation):
        tagged_corpus = [[(word_transformation(w), t) for w,t in sent]
                         for sent in tagged_corpus]

    print "extracting sents/tags"
    sents = ([w for w,t in sent]
             for sent in tagged_corpus)
    
    correct_tags = [[t for w,t in sent]
                    for sent in tagged_corpus]
    
    print "predicting"
    predicted_tags = []
    really_correct_tags = [] # some sentence might be dropped
    sentences = []
    for i, (ctags, sent) in enumerate(zip(correct_tags, sents)):
        if (i+1) % 5 == 0:
            print "%d finished" %(i+1)
        try:
            ptags = [t for w,t in tagger.tag(sent)]
            if len(ctags) == len(ptags):
                predicted_tags.append(ptags)
                really_correct_tags.append(ctags)
                sentences.append(sent)
            else:
                print "tags length does not match for %r" %(sent)                
        except UnicodeDecodeError:
            print "UnicodeDecodeError for ", sent
        except Exception:
            traceback.print_exc()

    if save:
        print "dumping to '%s'" %(result_path)
        dump((really_correct_tags, predicted_tags, sentences), open(result_path, "w"))
Example #31
def main(word_transformation=None, result_path=None, save=SAVE, n=500):
    tagger = POSTagger(
        '/cs/fs/home/hxiao/code/CoreNLP/classes/edu/stanford/nlp/models/pos-tagger/english-left3words/english-bidirectional-distsim.tagger',
        '/cs/fs/home/hxiao/code/CoreNLP/javanlp-core.jar')

    tagged_corpus = nltk.corpus.treebank.tagged_sents()[-n:]

    print "extracting sentence words"
    if word_transformation and callable(word_transformation):
        tagged_corpus = [[(word_transformation(w), t) for w, t in sent]
                         for sent in tagged_corpus]

    print "extracting sents/tags"
    sents = ([w for w, t in sent] for sent in tagged_corpus)

    correct_tags = [[t for w, t in sent] for sent in tagged_corpus]

    print "predicting"
    predicted_tags = []
    really_correct_tags = []  # some sentence might be dropped
    sentences = []
    for i, (ctags, sent) in enumerate(zip(correct_tags, sents)):
        if (i + 1) % 5 == 0:
            print "%d finished" % (i + 1)
        try:
            ptags = [t for w, t in tagger.tag(sent)]
            if len(ctags) == len(ptags):
                predicted_tags.append(ptags)
                really_correct_tags.append(ctags)
                sentences.append(sent)
            else:
                print "tags length does not match for %r" % (sent)
        except UnicodeDecodeError:
            print "UnicodeDecodeError for ", sent
        except Exception:
            traceback.print_exc()

    if save:
        print "dumping to '%s'" % (result_path)
        dump((really_correct_tags, predicted_tags, sentences),
             open(result_path, "w"))
def pos_tag_stanford(toked_sentence):
    """
	INPUT: list of strings
	OUTPUT: list of tuples

	Given a tokenized sentence, return
	a list of tuples of form (token, POS)
	where POS is the part of speech of token
	"""

    from nltk.tag.stanford import POSTagger
    import os

    basePath = os.getcwd()
    st = POSTagger(
        basePath +
        '/resources/stanford-postagger-2015-12-09/models/english-bidirectional-distsim.tagger',
        basePath +
        '/resources/stanford-postagger-2015-12-09/stanford-postagger.jar')

    return st.tag(toked_sentence)
def stanfordTag(modelPath,stanfordJarPath,text,encoding):

    if not bool(re.search("java.exe", os.getenv("JAVA_HOME"))):
        java_path=os.getenv("JAVA_HOME")+"/bin/java.exe"
        os.environ['JAVA_HOME'] = java_path
        print(java_path)
        nltk.internals.config_java(java_path)
    entities = []
    stemmer = SnowballStemmer("french")
    st = POSTagger(modelPath,stanfordJarPath,encoding) 
    print(text.split())
    tags=st.tag(text.split())
    print(tags)
    for tag in tags[0]:           
        entity = {
        'token': tag[0],
        'pos': tag[1],
        'stemm' : stemmer.stem(tag[0])       
        }
        entities.append(entity)
    return entities
Example #34
def tag_tokens(tokens):
    tagged_sents = []
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/mnt/sda2/stanford-packages/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',
                   encoding='utf8')

    print('Starting to tag sentences')
    """
    Progress Bar:
    """
    toolbar_width = 40

    # setup toolbar
    sys.stdout.write("[%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))
    # return to start of line, after '['

    no_of_sents = len(tokens)
    no_of_ticks = 0
    sent_counter = 0

    for line in tokens:
        # Returns a list of a list of tuples
        tagged_sents.append(st.tag(line))

        # Updating bar
        sent_counter += 1
        trigger = (sent_counter * toolbar_width - 1) / no_of_sents
        if trigger >= no_of_ticks:
            while no_of_ticks < math.floor(trigger):
                sys.stdout.write("-")
                sys.stdout.flush()
                no_of_ticks += 1

    sys.stdout.write(">]\n")
    print('Done tagging')

    return tagged_sents
Example #35
 def get_whole(self, sentence):
     opinion_dict = dict()
     pos_f = open('../opinion-lexicon-English/positive-words.txt', 'rb')
     neg_f = open('../opinion-lexicon-English/negative-words.txt', 'rb')
     for _ in xrange(35):
         pos_f.readline()
         neg_f.readline()
     for word in pos_f:
         opinion_dict[word.strip()] = True
     for word in neg_f:
         opinion_dict[word.strip()] = False
     pos_f.close()
     neg_f.close()
     stemmer = PorterStemmer()
     stanford_parser = parser.Parser()
     stanford_tagger = \
     POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger','../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
     w = open('sentence_test', 'wb')
     text_token = self.tf.stanford_tokenize(sentence)
     text_pos = stanford_tagger.tag(text_token)
     print text_pos
     text_dependency = stanford_parser.parseToStanfordDependencies(sentence)
     temp_list = ['none'] * len(text_token)
     for dep in text_dependency:
         if dep[0] == 'amod':
             temp_list[int(dep[1])] = '%s_1' % dep[0]
             temp_list[int(dep[2])] = '%s_2' % dep[0]
     #end for
     for num, item in enumerate(text_pos[0]):
         temp_str = 'order'
         if opinion_dict.has_key(item[0]):
             temp_str = 'opion'
         featrue_list=[item[0],item[1],stemmer.stem(item[0]),item[0].lower(),\
                       temp_str,temp_list[num],'O']
         w.write(' '.join(featrue_list) + '\n')
     pass
Example #36
 def get_whole(self,sentence):
     opinion_dict = dict();
     pos_f = open('../opinion-lexicon-English/positive-words.txt','rb');
     neg_f = open('../opinion-lexicon-English/negative-words.txt','rb');
     for _ in xrange(35):
         pos_f.readline();
         neg_f.readline();
     for word in pos_f:
         opinion_dict[word.strip()]=True;
     for word in neg_f:
         opinion_dict[word.strip()]=False;
     pos_f.close();
     neg_f.close();
     stemmer = PorterStemmer();
     stanford_parser = parser.Parser();
     stanford_tagger = \
     POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger','../stanford-postagger-full-2015-01-30/stanford-postagger.jar');
     w = open('sentence_test','wb');
     text_token = self.tf.stanford_tokenize(sentence);
     text_pos = stanford_tagger.tag(text_token);
     print text_pos;
     text_dependency = stanford_parser.parseToStanfordDependencies(sentence);
     temp_list = ['none']*len(text_token);
     for dep in text_dependency:
         if dep[0] == 'amod':
             temp_list[int(dep[1])]='%s_1'%dep[0];
             temp_list[int(dep[2])]='%s_2'%dep[0];
     #end for
     for num,item in enumerate(text_pos[0]):
         temp_str = 'order';
         if opinion_dict.has_key(item[0]):
             temp_str = 'opion';
         featrue_list=[item[0],item[1],stemmer.stem(item[0]),item[0].lower(),\
                       temp_str,temp_list[num],'O'];
         w.write(' '.join(featrue_list)+'\n');
     pass;
 def extract_examples(self):
     training_tuples = set()
     db_fh = open(self.database_loc, 'rb')
     for line in db_fh: #going through PPDB
         elements = line.strip().split(' ||| ')
         if len(elements[1].split()) == 2 or len(elements[2].split()) == 2: #only look at 2-to-1 or 1-to-2 paraphrases
             many_phrase = elements[1] if len(elements[1].split()) == 2 else elements[2]
             one_phrase = elements[1] if len(elements[1].split()) == 1 else elements[2]
             if self.filter_number: #filter numbers, these are useless
                 isNumber = False
                 for token in many_phrase.split():
                     if self.pos_provided:
                         token = token.split('#')[0]
                     if self.is_number(token):
                         isNumber = True
                 if not isNumber:
                     training_tuples.add((one_phrase, many_phrase))
             else:
                 training_tuples.add((one_phrase, many_phrase))
     tagger = POSTagger(self.TAGGER_MODEL, self.TAGGER_LOC)
     self.training_examples = {} #reset training examples
     for element in training_tuples: #now, tag the resulting data
         words = element[1].split()
         words_only = ""
         if self.pos_provided: #if pos tags provided externally can just merge them here otherwise call the tagger
             words_only = ' '.join([word_pos.split('#')[0] for word_pos in words])
         pos_tags = [word_pos.split('#')[1] for word_pos in words] if self.pos_provided else [word_pos[1] for word_pos in tagger.tag(words)]            
         collapsed_pos = []
         for pos in pos_tags: #cluster certain pos tags together
             new_pos = collapsePOS(pos)
             collapsed_pos.append(new_pos)
         key = ' '.join(collapsed_pos)
         examples = self.training_examples[key] if key in self.training_examples else []
         if self.pos_provided:
             examples.append(' '.join([element[0], words_only]))
         else:
             examples.append(' '.join([element[0], element[1]]))
         self.training_examples[key] = examples
     sys.stderr.write("PPDB training data tagged and sorted\n")
     db_fh.close()
Example #38
import os
from nltk import *
from nltk.tag.stanford import POSTagger
from nltk.stem.wordnet import WordNetLemmatizer
PATH_TO_TAGGER = os.path.join(os.getcwd(), "lib\\wsj-0-18-bidirectional-nodistsim.tagger")
PATH_TO_JAR = os.path.join(os.getcwd(), "lib\\stanford-postagger.jar")
print pos_tag("the sea touches me".split())
stanford_tagger = POSTagger(PATH_TO_TAGGER,PATH_TO_JAR)
print stanford_tagger.tag(word_tokenize("which ocean touches the state of California ?"))
class PersianPipeline:

	def __init__(self, posTagModelPath, posTaggerPath, parserModelPath, workingDir):
		
		
		
		try:
			self.logger = logging.getLogger(__name__)
			self.posTagger = POSTagger(posTagModelPath, posTaggerPath,encoding="UTF-8", java_options='-Xmx16000m')
			#self.posTagger = POSTagger(posTagModelPath, posTaggerPath,"UTF-8")
			#print "pos tagger is loaded"
		except:
			self.logger.warning("Error in loading POS tagger!")
			e = sys.exc_info()[0]
			self.logger.warning("Error:" + str(e))
					
		
		try:
			self.parser = MaltParser(tagger=None, mco = parserModelPath, working_dir= workingDir, additional_java_args=['-Xmx16000m']) 
			#print "parser is loaded"
		except:
			self.logger.warning("Error in loading the MALT Parser")
			e = sys.exc_info()[0]
			self.logger.warning("Error:" + str(e))				
	
	# tokenizes, fixes some of the detached affixes
	def preprocess(self, s):
		# remove the diacritics
		drs = s
		for c in range(1611, 1619):
			drs = drs.replace(unichr(c),"")
		# normalize the Arabic yaa
		drs = drs.replace(unichr(1610),unichr(1740))
		drs = drs.replace(unichr(1609),unichr(1740))
		
		# tokenize the sentence
		ts = self.seperatePuncs(drs)
		# fix the affixes
		afs = self.fixAffixes(ts)
		
		# replace slashes and pounds and underlines
		afs= afs.replace("#","-")
		afs= afs.replace("/","-")
		afs = afs.replace("_","-")
		return afs
	
	def preprocess4Annotation(self, s):
		ps = self.preprocess(s)
		ts = self.posTagASentence(ps)
		if ts:
			print "pos tagged"
		else:
			print "tagging failed"
		attS = self.attachPerCompounds(ts)
		#" ".join(attS.split)
		# get the first element of pos tuples and join them to form the sentence
		# replace the ^ sign (from compound attaching) with space
		finalS = " ".join(map(lambda x: x[0].replace("^"," "), attS))
		return finalS
	# tokenize a persian sentence
	def seperatePuncs(self, s):
		
		s = re.sub(ur"([\[{\(\\`\"‚„†‡‹‘’“”•\.–—›««])", r"\1 ", s)
		s = re.sub(ur"([\]}\'\`\"\),;:!\?\%‚„…†‡‰‹‘’“”•–—›»\.])", r" \1", s)
		# persian specific
		s = re.sub(ur"([،؛؟،\.])", r" \1 ", s)
		s = s.replace("  ", " ")
		return s
	
	

	def fixAffixes(self, sent):
		suffList = [u"ها", u"های"]
		sSent = sent.split(" ")
		newTokSent = []
		sentLen = len(sSent)
		i = 0
		try:
			while i < sentLen:
	
				if sSent[i] in suffList and newTokSent:
					#print "+++ Affix problem got fixed"
					# attach the suffix to the previous word
					newTokSent[-1] = newTokSent[-1] + u"\u200c" + sSent[i]
				else:
					newTokSent.append(sSent[i])
				i += 1
			return " ".join(newTokSent)
		except:
			return sent 

	
		
	def posTagASentence(self, sent):
		try:
			sent = sent.replace("/","-")
			posSent = self.posTagger.tag(sent.split())
			return posSent
		except:
			self.logger.warning("problem in pos!" + sent)
			return None

	# Function reads in a POS tagged sentence (list) and if there are two adjacent verbs, it attaches them together and make them one word.
	def attachPerCompounds(self, posSent):
			
		prFlag = False
		ct = senCt = prCt = 0
		i = 0
		senCt += 1
		pos = wd = outWd = ""
		sentLen = len(posSent)
		newPOSSent = []
		while i < sentLen - 1:
			ct += 1
			tok = posSent[i]
			nexTok = posSent[i+1]
			(wd, pos) = tok
			(nwd, npos) = nexTok
			outWd = wd 
			if pos == "V":
				if npos == "V":
					prFlag = True
					outWd = wd + '^' + nwd
					pos = "V"
					i += 1
			# attaching the "mi" prefix for present continious form
			if npos == "V" and wd.strip() == u"می":		
				prFlag = True
				outWd = u"می" + u"\u200c" + nwd
				pos = "V"
				i += 1
				#print "the mi case "
				#t.write("outWd:" + outWd + "\n")
				
				
			newPOSSent.append((outWd, pos))
			i += 1
		
		# don't forget the last word (if not processed)
		if i < sentLen:
			ct += 1
			tok = posSent[-1]
			newPOSSent.append(tok)
			
		# counting the lines with compound verbs patterns
		if prFlag:
			prCt += 1
		#print prCt
		
		#t.write(newPOSSent[-2][0] + "--" + newPOSSent[-1][0] + "\n")
		return newPOSSent
################################################################
		
		
	def parseATaggedSentence(self, tSent):
		try:

			compTSent = self.attachPerCompounds(tSent)
			depParse = self.parser.tagged_parse(compTSent)

			if depParse:
				pl = depParse.to_conll(10).replace("^", " ")
				return pl
			else:
				return None
			
		except Exception, e:
			print "Error in parsing a sentence!" + str(e)  
			return None
def evaluate(granularity, text):

    preprocessor = Preprocessor()
    entry = TextEntry()
    entry.body = text
    preprocessor.entries = [entry]

    data = preprocessor.get_clean_data()
    ncharsAll = preprocessor.getNChars(items=data, freq=20)

    test_data_raw = preprocessor.get_clean_data()
    test_raw_text = preprocessor.get_raw_words()

    count_vect = joblib.load('../models/t1/vec_count.joblib')
    tfidf_transform = joblib.load('../models/t1/tfidf_transform.joblib')

    data_counts = count_vect.transform(test_data_raw)
    test_data = tfidf_transform.transform(data_counts)

    dense_test = test_data.toarray()

    vocab = count_vect.vocabulary_
    nchars = []
    for nchar in ncharsAll:
        if nchar not in vocab:
            nchars.append(nchar)

    numOfTags = len(tags)
    ncharVecSize = len(nchars)

    tag_vecs = []
    pos = POSTagger(model, jar, java_options='-mx2500m')
    for i, text in enumerate(test_raw_text):
        if i % 10 == 0:
            print(i)
        words = text.split()
        tag_vector = np.zeros(numOfTags)
        words_with_tags = pos.tag(words)
        only_tags = [tag for word, tag in words_with_tags[0]]
        tags_with_freq = Counter(only_tags)
        for tag, freq in tags_with_freq.items():
            tag_vector[tags.index(tag)] = freq / len(words)
        tag_vecs.append(tag_vector)

    for i, text in enumerate(test_raw_text):
        if i % 100 == 0:
            print(i)
        words = text.split()
        ncharVec = np.zeros(ncharVecSize)
        for word in words:
            for size in sizes:
                text_nchars = [
                    word[i:i + size] for i in range(len(word) - size + 1)
                ]
                text_nchars_with_freq = Counter(text_nchars)
                for nchar, freq in text_nchars_with_freq.items():
                    if nchar in nchars:
                        ncharVec[nchars.index(nchar)] = freq / len(words)

        test_data[i] = np.concatenate((dense_test[i], ncharVec, tag_vecs[i]))

    svm_l = joblib.load('../models/t1/svm_l_' + granularity + '/svm_l_' +
                        granularity + '.joblib')
    svm_u = joblib.load('../models/t1/svm_u_' + granularity + '/svm_u_' +
                        granularity + '.joblib')

    evaluator = ClfEval(svm_l, svm_u)
    return evaluator.eval_data(csr_matrix(test_data))
Example #41
class TextParser:
        taggedText = Counter()
        tagList = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR',
                   'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS',
                   'PDT', 'POS', 'PRP', 'RB', 'RBR', 'RBS', 'RP',
                   'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
                   'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'] #Penn treebank tags
        
        tagCriteria = ('DT', 'EX', 'JJ', 'MD', 'NN',
                       'POS', 'PRP', 'RB', 'VB', 'VBD',
                       'VBG', '#', '$', "'", ',')
        stanfordTagger = None
        #config_java("C:\Program Files\Java\jdk1.6.0_37\\bin\java.exe") 

        def __init__(self, pathToParser=None, javaHeapOptions='-Xmx4g -XX:+UseParallelGC -XX:-UseGCOverheadLimit'):

                if pathToParser is None:
                        taggerLibraryPath = normpath(os.path.join(os.getcwd(), "sp/jar/stanford-postagger.jar"))
                        taggerModelPath = normpath(os.path.join(os.getcwd(), "sp/models/english-bidirectional-distsim.tagger"))
                else:
                        taggerLibraryPath = normpath(os.path.join(pathToParser, "sp/jar/stanford-postagger.jar"))
                        taggerModelPath = normpath(os.path.join(pathToParser, "sp/models/english-bidirectional-distsim.tagger"))

                self.stanfordTagger = POSTagger(taggerModelPath,
                        taggerLibraryPath, java_options=javaHeapOptions)

                """
                print "---"
                print "Tagger library path: " + taggerLibraryPath
                print "Tagger model path: " + taggerModelPath
                print "---"
                """

        def tagTextFile(self, documentName, textFilePath, useCriteria=False):
                tempTaggedText, finalList = [], []
                textFile = readFromFile(textFilePath)
                
                for line in textFile.splitlines():
                        tempTaggedText.extend(self.stanfordTagger.tag(line.split()))

                if useCriteria:
                        for x, y in tempTaggedText:
                                if y in self.tagCriteria:
                                        finalList.append((x, y))
                else:
                        for x, y in tempTaggedText:
                                finalList.append((x, y))
                                

                self.taggedText[documentName] = finalList

        def getTagCountVector(self, textString):
                splitString = textString.split()
                numberOfWords = len(splitString)
                tempTaggedText = self.stanfordTagger.tag(splitString)
                counterVector = Counter([y for x, y in tempTaggedText if y in self.tagList]) #Get tags

                resultantVector = OrderedDict()

                for k in self.tagList:
                        if k in counterVector:
                                resultantVector[k] = float(counterVector[k])/numberOfWords
                        else:
                                resultantVector[k] = 0

                return resultantVector
                

        def tagText(self, documentName, textString, useCriteria=False):
                tempTaggedText, finalList = [], []
                
                for line in textString.splitlines():
                        tempTaggedText.extend(self.stanfordTagger.tag(line.split()))
                
                if useCriteria:
                        for x, y in tempTaggedText:
                                if y in self.tagCriteria:
                                        finalList.append((x, y))
                else:
                        for x, y in tempTaggedText:
                                finalList.append((x, y))                

                self.taggedText[documentName] = finalList

        def getEmailFromString(self, emailString):
                message = Parser().parsestr(emailString)
                return (message, message.is_multipart())

        def ngram(self, textString, n=3): #Defaults to tri-gram
                return ngrams(textString.split(), n)
Example #42
length = 0
i = 0
for fname in os.listdir('test_data'):

    if fname.endswith('.edus'):
        print i
        print fname
        i = i + 1
        f = open(os.path.join('test_data', fname), 'r')
        mys1 = os.path.join('test_data', fname.split(".")[0] + ".pos")
        print mys1
        pos = open(mys1, "w")
        data = f.read().splitlines()

        for line in data:
            if len(line) > length:
                length = len(line)
            wordb = word_tokenize(line)
            tags = english_postagger.tag(wordb)
            pos.write(str(line.strip()))
            pos.write("@#%^&*")
            for tgpair in tags:
                pos.write(str(tgpair[1]))
                pos.write("\t")
            pos.write("\n")

        # print i
        # i=i+1
        # print length

    #  continue;
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy
import nltk
from nltk.tag.stanford import POSTagger
import sys

if len(sys.argv) != 2:
    print 'must have one argument'
    sys.exit()

chunk = sys.argv[1].decode('utf-8')
#chunk = u"妈我"

text = nltk.word_tokenize(chunk.encode('utf-8'))
st = POSTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar')

tsentence = st.tag(text)
# print tsentence
for w in tsentence:
    # print w
    # print w[1].decode('utf-8'),
    print w[1].split('#')[1]
Example #44
# import nltk with the Stanford Spanish tagger
from nltk.tag.stanford import POSTagger
spanish_postagger = POSTagger('c:/stanford-postagger/models/spanish.tagger',
                              'c:/stanford-postagger/stanford-postagger.jar')

# Read the article texts from the database
con = psycopg2.connect(CHATBOT_CONNECTION_STRING)
cursor = con.cursor()
cursor.execute("SELECT texto FROM reglamentacion.articulos")
textos = cursor.fetchall()

# iterate over the texts, POS-tag each one, and collect in a list only the words
# that are nouns, adjectives and verbs

for texto in textos:
    listaPalabras = spanish_postagger.tag(texto[0].split())
    for palabras in listaPalabras:
        for pal in palabras:
            if pal[1][0:1] == 'v':
                cursor.execute(
                    "INSERT INTO reglamentacion.palabras(id, palabra, tipo) VALUES (DEFAULT, '"
                    + pal[0] + "', 'V')")
                con.commit()
            if pal[1][0:1] == 'n':
                cursor.execute(
                    "INSERT INTO reglamentacion.palabras(id, palabra, tipo) VALUES (DEFAULT, '"
                    + pal[0] + "', 'S')")
                con.commit()
            if pal[1][0:1] == 'a':
                cursor.execute(
                    "INSERT INTO reglamentacion.palabras(id, palabra, tipo) VALUES (DEFAULT, '"
Example #45
class yagoScores:
    def __init__(self):
        self.cnx = pymysql.connect(user='******', database='yago', password = '******')
        self.cursor = self.cnx.cursor()
        self.query = "select * from yagoFacts where t1='%s' or t3='%s'"
        self.en_postagger = POSTagger('parser/models/english-bidirectional-distsim.tagger', 'parser/stanford-postagger.jar')
        self.stopwords = nltk.corpus.stopwords.words('english')    

        
    def parse(self,text):
        return self.en_postagger.tag(text.split())
        
    def get_underscoreWords(self,text):
        return re.findall("[a-z]+_[a-z]+", text)
    
    def findNounsSeq(self,tuples):
        self.noun = []    
        self.nouns = []
        prev = ""
        for each in tuples:
            if(each[1]=="NN"):
                self.noun.append(each[0])
            if(each[1]=="NNS"):
                self.nouns.append(prev+" "+each[0])
                prev = prev+" "+each[0]
            else:
                prev = each[0]
    def changeToYagoFormat(self, g):
        g=g.strip()
        g=g.replace("'","")
        char = [c for c in g]
        char[0] = char[0].upper()
        prev = False 
        for i in range(0,len(g)):
            if(prev == True):
                char[i] = char[i].upper()
                prev = False;
            if(char[i]=="_"):
                prev = True;
        return "<"+"".join(char)+">"
    
    def getFacts(self, g):
        facts = []
        #print self.query%(g,g)
        self.cursor.execute(self.query%(g,g))
        for each in self.cursor:
            #each = each.replace("<","")
            #each = each.replace(">","")
            facts.append([each[1],each[2],each[3]])
        return facts
    def generateFeatures(self,tuples,facts):
        t2 = 0
        t1_t3 = 0
        for f in facts:
            #print f
            f = str(f[0].decode('ascii', 'ignore'))+str(f[1].decode('ascii', 'ignore'))+str(f[2].decode('ascii', 'ignore'))
            f = f.lower()
            f = f.split(">")
            f = f[:3]
            for i in range(0,3):
                f[i] = f[i].replace("_"," ")
                f[i] = f[i].replace("<","")
                f[i] = f[i].replace(")","")
                f[i] = f[i].replace("(","")
                f[i] = f[i].replace("-"," ")                                           
            #print f
            #print tuples.split()
            for each in tuples.split():
                if(each not in self.stopwords):
                    each = str(each).lower() 
                    each = each.replace("_"," ")
                    if(len(each)>2 and each in f[1]):
                        print (each+"---"+f[1])
                        t2 += 1
                    #print each
                    #print (f[0].split())
                    if(len(each)>2 and (each in f[0].split() or each in f[2].split())):
                        print (each+"------------"+f[0]+"----"+f[2])
                        
                        t1_t3 += 1    
        # TODO Now returns only total similarities
        #print verbs
        """
        bucket = 0
        if(verbs == 0):
            bucket = 0
        elif(verbs > 1 and verbs < 6):
            bucket = 1
        elif(verbs >= 6 and verbs < 11):
            bucket = 2
        elif(verbs >= 11 and verbs < 16):
            bucket = 3
        elif(verbs >= 16 and verbs < 21):
            bucket = 4
        elif(verbs >= 21 and verbs < 26):
            bucket = 5
        elif(verbs >= 26 and verbs < 31):
            bucket = 6
        elif(verbs >= 31 and verbs < 40):
            bucket = 7
        else:
            bucket = 8    
        return bucket;
        """
        return [t2, t1_t3];
    def searchInYago(self,text,guess):
        eachGuess = self.changeToYagoFormat(guess)
        facts = self.getFacts(eachGuess)
        #print facts
        count = self.generateFeatures(text,facts)
        return count
    # Call getScore(text, guess) from outside; returns the [t2, t1_t3] YAGO-fact similarity counts
    # input: (text, a single guess string)
    def getScore(self,text,guessess):
        #print (text+guessess)
        self.freq = defaultdict(int)
        #print ("IN GUESSS")
        #tuples = self.parse(text)
        #print tuples
        #self.findNounsSeq(tuples)
        #return self.searchMultipleInYago(tuples,guessess)
        return self.searchInYago(text,guessess.strip())
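A minimal usage sketch for the class above, assuming the MySQL 'yago' database and the tagger/jar paths from __init__ are in place; the text and guess below are made-up inputs:

ys = yagoScores()
text = "company that develops the android operating system"
# getScore takes the raw text and a single guess string; it returns the [t2, t1_t3] counts
print ys.getScore(text, "google")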
>>>import nltk
>>>from nltk import word_tokenize
>>>s="I was watching TV"
>>>print nltk.pos_tag(word_tokenize(s))

# all nouns

>>>tagged=nltk.pos_tag(word_tokenize(s))
>>>allnoun=[word for word,pos in tagged if pos in ['NN','NNP'] ]

# Stanford POS tagger 

>>>from nltk.tag.stanford import POSTagger
>>>import nltk
>>>stan_tagger=POSTagger('models/english-bidirectional-distsim.tagger','stanford-postagger.jar')
>>>tokens =nltk.word_tokenize(s)
>>>stan_tagger.tag(tokens)

# POS tags freq distribution
>>>from nltk.corpus import brown
>>>import nltk
>>>tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
>>>print nltk.FreqDist(tags)

# default tagger
>>>brown_tagged_sents = brown.tagged_sents(categories='news')
>>>default_tagger = nltk.DefaultTagger('NN')
>>>print default_tagger.evaluate(brown_tagged_sents)

# N-gram taggers

>>>from nltk.tag import UnigramTagger
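A hedged continuation of the n-gram tagger example, following the standard NLTK API: split the Brown news sentences into training and test portions, train a UnigramTagger, and score it.

>>>train_sents = brown_tagged_sents[:3000]
>>>test_sents = brown_tagged_sents[3000:]
>>>unigram_tagger = UnigramTagger(train_sents)
>>>print unigram_tagger.evaluate(test_sents)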
Example #47
0
from nltk.tag.stanford import POSTagger

postagger = POSTagger(
    "./stanford-postagger-full-2014-10-26/models/english-bidirectional-distsim.tagger",
    "./stanford-postagger-full-2014-10-26/stanford-postagger.jar")
print postagger.tag('What is the airspeed of an unladen swallow ?'.split())
import nltk
from nltk.tag.stanford import POSTagger
st = POSTagger('stanford-postagger-2014-01-04/models/english-bidirectional-distsim.tagger','stanford-postagger-2014-01-04/stanford-postagger.jar')
sentence='Himanshu Bindal is a genius?'
taggedSentence= st.tag(nltk.word_tokenize(sentence))
print taggedSentence
Example #49
0
import urllib
import nltk
from bs4 import BeautifulSoup
from html2text import html2text
import re

#response = urllib.request.urlopen('http://python.org')
#html =response.read()
#data cleaning
#soup = BeautifulSoup(html,'html.parser')
#clean =soup.get_text()
#print (clean)

#part-of-speech tagging
from nltk import word_tokenize
s =" I was a teacher, I am watching TV"
print (nltk.pos_tag(word_tokenize(s)))

from nltk.tag.stanford import POSTagger
stan_tagger = POSTagger('/Users/wang/dev/stanford-postagger/models/english-bidirectional-distsim.tagger','/Users/wang/dev/stanford-postagger/stanford-postagger-3.9.1.jar')
tokens = word_tokenize(s)
print ('Stanord Tagger')
print (stan_tagger.tag(tokens))

#named entity recognition (NER)
from nltk import ne_chunk
Sent =" Here is Stallman, he was working at HSBC Co. LTD before "
print (ne_chunk(nltk.pos_tag(word_tokenize(Sent)),binary=False))
Example #50
0
from nltk.tag.stanford import POSTagger

def pos_stanford(tokens):

    tagger = POSTagger('./english-bidirectional-distsim.tagger',
                       './stanford-postagger.jar')
    return tagger.tag(tokens)
]
tokens = [t for t in tokens if t not in non_tech_words and t.isalpha()]
print tokens
# Part of Speech Tagging
tags = []
starti = 0
endi = 0
no_chunks = max(1, len(tokens) / 5000)  # at least one chunk, so short token lists still get tagged
print 'Process ' + str(
    len(tokens)) + ' tokens in ' + str(no_chunks) + ' chunks..'
for l in range(0, no_chunks):
    endi = min((starti + (len(tokens) / no_chunks)), len(tokens))
    print "Tagging #" + str(l) + ": from " + str(starti) + " to " + str(endi -
                                                                        1)
    #tags = tags + nltk.pos_tag(tokens[starti:endi]);
    tags = tags + pos.tag(tokens[starti:endi])[0]  # 'pos' is the POS tagger instance created earlier (not shown in this excerpt)
    starti = endi

print "\n" + str(len(tags)) + " words tagged.."

# Save all the Noun and Adjective unigrams in a hash table
Tag_set = {'Word': 'Tag'}
for tag in tags:
    if (cleanseNN([str(tag[1])]) in patterns[0:2]):
        Tag_set[str(tag[0])] = str(tag[1])
    #print cleanseNN([str(tag[1])])
#print tags
#print '\n'
print Tag_set

# Look for longest n-gram appearing in each sentence with the patterns of technical terms
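A rough sketch of this last step, assuming each sentence is available as a list of tokens and reusing the Tag_set built above; the pattern check is simplified to "every word was tagged as a noun or adjective":

def longest_candidate_ngram(sentence_tokens, tag_set, max_n=4):
    # scan from the longest n-grams downwards and return the first one whose
    # words all appear in the noun/adjective table built above
    for n in range(max_n, 0, -1):
        for i in range(len(sentence_tokens) - n + 1):
            gram = sentence_tokens[i:i + n]
            if all(w in tag_set for w in gram):
                return ' '.join(gram)
    return None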
Example #52
0
from nltk import *
from nltk.tag.stanford import POSTagger
PATH_TO_TAGGER=r'C:\AptanaWorkspace\Thesis\src\lib\english-left3words-distsim.tagger'
PATH_TO_JAR=r'C:\AptanaWorkspace\Thesis\src\lib\stanford-postagger.jar'
st = POSTagger(PATH_TO_TAGGER,PATH_TO_JAR)
s1="Where is the nearest city to Columbus?"
s2="Where is the nearest city to Ohio State University?"
tagged_question=st.tag(word_tokenize(s2))
s= corpus.treebank.tagged_sents()[22]
print s
print tagged_question
print ne_chunk(tagged_question)
Example #53
0
import re
from nltk.tag.stanford import POSTagger
from nltk.parse.malt import MaltParser

class PersianPipeline:
    def __init__(self, posTagModelPath, posTaggerPath, parserModelPath,
                 workingDir):

        try:
            self.posTagger = POSTagger(posTagModelPath, posTaggerPath, "UTF-8")
            print "pos tagger is loaded"
        except:
            print "Error in loading POS tagger"

        try:
            self.parser = MaltParser(tagger=None,
                                     mco=parserModelPath,
                                     working_dir=workingDir)
            print "parser is loaded"
        except:
            print "Error in loading the MALT Parser"

    # tokenizes, fixes some of the detached affixes
    def preprocess(self, s):
        # remove the diacritics
        drs = s
        for c in range(1611, 1619):
            drs = drs.replace(unichr(c), "")

        # tokenize the sentence
        ts = self.seperatePuncs(drs)
        # fix the affixes
        afs = self.fixAffixes(ts)

        # replace slashes and pounds and underlines
        afs = afs.replace("#", "-")
        afs = afs.replace("/", "-")
        afs = afs.replace("_", "-")
        return afs

    # tokenize a persian sentence
    def seperatePuncs(self, s):

        s = re.sub(ur"([\[{\(\\`\"‚„†‡‹‘’“”•\.–—›««])", r"\1 ", s)
        s = re.sub(ur"([\]}\'\`\"\),;:!\?\%‚„…†‡‰‹‘’“”•–—›»\.])", r" \1", s)
        # persian specific
        s = re.sub(ur"([،؛؟،\.])", r" \1 ", s)
        s = s.replace("  ", " ")
        return s

    def fixAffixes(self, sent):
        suffList = [u"ها", u"های"]
        sSent = sent.split(" ")
        newTokSent = []
        sentLen = len(sSent)
        i = 0
        while i < sentLen:

            if sSent[i] in suffList:
                print "+++ Affix problem got fixed"
                # attach the suffix to the previous word
                newTokSent[-1] = newTokSent[-1] + u"\u200c" + sSent[i]
            else:
                newTokSent.append(sSent[i])
            i += 1
        return " ".join(newTokSent)

    def posTagASentence(self, sent):
        try:
            sent = sent.replace("/", "-")
            posSent = self.posTagger.tag(sent.split())
            return posSent
        except:
            return None

    # Function reads in a POS tagged sentence (list) and if there are two adjacent verbs, it attaches them together and make them one word.
    def attachPerCompounds(self, posSent):

        prFlag = False
        ct = senCt = prCt = 0
        i = 0
        senCt += 1
        pos = wd = outWd = ""
        sentLen = len(posSent)
        newPOSSent = []
        while i < sentLen - 1:
            ct += 1
            tok = posSent[i]
            nexTok = posSent[i + 1]
            (wd, pos) = tok
            (nwd, npos) = nexTok
            outWd = wd
            if pos == "V":
                if npos == "V":
                    prFlag = True
                    outWd = wd + '^' + nwd
                    pos = "V"
                    i += 1
            # attaching the "mi" prefix for the present continuous form
            if npos == "V" and wd.strip() == u"می":
                prFlag = True
                outWd = u"می" + u"\u200c" + nwd
                pos = "V"
                i += 1
                #print "the mi case "
                #t.write("outWd:" + outWd + "\n")

            newPOSSent.append((outWd, pos))
            i += 1

        # don't forget the last word (if not processed)
        if i < sentLen:
            ct += 1
            tok = posSent[-1]
            newPOSSent.append(tok)

        # counting the lines with compound verbs patterns
        if prFlag:
            prCt += 1
        #print prCt

        #t.write(newPOSSent[-2][0] + "--" + newPOSSent[-1][0] + "\n")
        return newPOSSent


################################################################

    def parseATaggedSentence(self, tSent):
        try:
            compTSent = self.attachPerCompounds(tSent)
            depParse = self.parser.tagged_parse(compTSent)
            return depParse
        except:
            print "Error in parsing a sentence!"
            return None

    def parseASentence(self, sent):
        pass
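A minimal usage sketch for the pipeline above; the four constructor paths are hypothetical placeholders and the input would be a raw Persian sentence:

pp = PersianPipeline('persian.tagger', 'stanford-postagger.jar',
                     'persian-malt.mco', '/tmp/malt-work')
raw = u"..."                        # a raw Persian sentence goes here
clean = pp.preprocess(raw)          # strips diacritics, tokenizes, reattaches affixes
tagged = pp.posTagASentence(clean)
if tagged is not None:
    print pp.parseATaggedSentence(tagged)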
Example #54
0
import cPickle

# 'english_postagger' below is assumed to be a Stanford POSTagger instance created elsewhere in this project (not shown in this excerpt)
parsed = open('combine_parsed', 'r').read()
reviews = parsed.split('> (')

processed_reviews = []
pos_sentence = {}

for review in reviews:
    if review != '':
        review = review.strip()[:-1]
        review = review.split('\n')[:-1]

        processed_items = []
        for item in review:

            item = item.split('\t')
            item[1] = item[1][1:]
            item[-1] = item[-1][:-1]
            item[2] = item[2].split(' ')

            processed_items.append(item)

        processed_reviews.append(processed_items)

for ind, review in enumerate(processed_reviews):
    tokens = []
    for item in review:
        if item[0][0] != 'E':
            tokens.append(item[1])
    pos_sentence[ind] = english_postagger.tag(tokens)

postagger = cPickle.dump(pos_sentence, open('pos_combine', 'wb'))
Example #55
0
from psycopg2 import connect  # assumed driver, based on the libpq-style connection string
from nltk import word_tokenize

# 'tagger' below is assumed to be a Stanford POSTagger instance created earlier (not shown in this excerpt)
conn = connect("dbname=Ohio user=postgres password=ohiostate")
cur = conn.cursor()
# tag new sentences and update the database
question_type_id=None
with open("new_question",'r') as fr:
    while(1):
        line=fr.readline()
        line=line.strip()
        if line=='':    # EOF
            break
        if line[0]=='#':
            question_type_id=int(line.split(' ')[1])
            print question_type_id,line
            count=0
        else:
            tagged_question=tagger.tag(word_tokenize(line))
            tag=' '.join([t for w,t in tagged_question])
            tagged=' '.join([w+'/'+t for w,t in tagged_question])
            try:
                sql = """INSERT INTO template (tag, question_type_id) values ('%s',%d)""" % \
                    (tag,question_type_id)
                print sql
                cur.execute(sql)    
                conn.commit()
            except Exception,e:
                print str(e)
                conn.rollback()
            try:
                sql = """INSERT INTO question (sentence,tagged,tag,question_type_id) values ('%s','%s','%s',%d)""" % \
                    (line.replace("'", "''"),tagged.replace("'", "''"),tag,question_type_id)
                print sql
                cur.execute(sql)
                conn.commit()
            except Exception,e:
                print str(e)
                conn.rollback()
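A hedged variant of the two INSERTs above, assuming psycopg2 (suggested by the libpq-style connection string): parameter binding lets the driver handle quoting, so the manual doubling of single quotes is no longer needed.

sql = "INSERT INTO template (tag, question_type_id) VALUES (%s, %s)"
cur.execute(sql, (tag, question_type_id))
sql = ("INSERT INTO question (sentence, tagged, tag, question_type_id) "
       "VALUES (%s, %s, %s, %s)")
cur.execute(sql, (line, tagged, tag, question_type_id))
conn.commit()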
Example #56
0
from nltk.tag.stanford import POSTagger

def pos_stanford(tokens):

    tagger = POSTagger('./english-bidirectional-distsim.tagger',
                       './stanford-postagger.jar')
    return tagger.tag(tokens)
Example #57
0
from nltk.tag.stanford import POSTagger
from sidd.paraphraser.Paraphraser import *
import nltk
import os
# lm=ARPALanguageModel('../../jars/lm_csr_5k_vp_2gram.arpa',encoding='utf-8')
#
# print lm.score('Hello how are you')
os.environ[
    'JAVA_HOME'] = 'C:/Program Files/Java/jdk1.7.0_17/bin'  ##Lab desktop
from sidd.paraphraser import PPDBLoader
english_postagger = POSTagger('../../jars/english-left3words-distsim.tagger',
                              '../../jars/stanford-postagger.jar',
                              encoding='utf-8')
syntacticMap = PPDBLoader.createSyntacticParaphraseMap('s', '../../ppdb')

#sentence='An Emirates and an Etihad aircraft, flying in opposite directions came in proximity of each other over the Indian Ocean, leading to a collision alert warning in the two cockpits on Sunday night.'
#orig_Sentence=sentence.decode('utf-8', errors='replace')
sentence = 'The box is thrown.'
orig_Sentence = sentence
sentence = english_postagger.tag(nltk.word_tokenize(sentence))
print sentence[0][0]
print sentence
modSentence = sentenceTuple(sentence)

ppCandidateList = generateNGramCandidatesToChange(modSentence, MAX_NGRAMS=4)
#all_possible_transformations=generateListOfPossibleTransformations(ppCandidateList,
#                                                                        LexicalPPDict=LexicalPPDict,
#                                                                        PhrasalPPDict=PhrasalPPDict,
#                                                                        stopwords=stopwordList,
#                                                                        useIdentities=False)