Esempio n. 1
def tagged_def():
    # Tag every definition found in a JSON file with the Stanford POS tagger
    # and store the tags under each entry's 'tagged_def' key.
    # NOTE(review): this snippet is truncated -- the StanfordPOSTagger(...)
    # arguments, the file object handed to json.load(...) and the statement
    # that writes the output file are missing, so it does not parse as shown.
    # It also relies on Python 2 print statements.
    # Expose the Java location through the JAVAHOME environment variable.
    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    tagger = StanfordPOSTagger(

    # Input file: "data/items_tagged_modified.json" (platform separator).
    path_data = "data" + os.sep + "items_tagged_modified.json"
    data = json.load(, encoding='UTF-8'))
    for item in data:
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            #             print chardet.detect(definition)
            print definition.encode('gbk')
            # Drop every parenthesised span (non-greedy, may span newlines).
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            # The NLTK tokens are only used for the encoding diagnostics below.
            tokens = nltk.word_tokenize(definition_pure)
            #             print tokens
            for token in tokens:
                print chardet.detect(token)
            # The tagger itself receives a plain whitespace split of the
            # UTF-8 encoded definition, not the NLTK tokens.
            tagged_tokens = tagger.tag(definition_pure.encode('utf-8').split())
            pos2def['tagged_def'] = tagged_tokens

    path_tagged_output = "items_tagged_auto.json"
    , 'w', 'utf-8'),
Esempio n. 2
def getUsername(message, *args):
    """Suggest a username based on the nouns found in *message*.

    The message is lower-cased, tokenized and POS-tagged; every word tagged
    as NN, NNP, FW or NNPS is a candidate username.

    :param message: the raw user message.
    :param args: optional; when given, ``args[0]`` is the previously used
        username and is excluded from the candidates.
    :return: the last candidate if the message sentiment is ``'pos'``, the
        last candidate with ``'salt123'`` appended otherwise, or the fallback
        ``'randomuser567user'`` when no candidate was found.
    """
    pos_tagger = StanfordPOSTagger(model, jar, encoding="utf-8")
    words = nltk.word_tokenize(message.lower())
    tagged_words = pos_tagger.tag(words)
    candidate_tags = ['NN', 'NNP', 'FW', 'NNPS']

    # Check if a previous username input was passed
    if args:
        previous_username = args[0]
        sug_usernames = [
            word for word, tag in tagged_words
            if tag in candidate_tags and word != previous_username
        ]
    else:
        sug_usernames = [
            word for word, tag in tagged_words
            if tag in candidate_tags
        ]

    if sug_usernames:
        if getSentenceSentiment(message) == 'pos':
            return sug_usernames[-1]
        # Non-positive sentiment: salt the last suggested username.
        return sug_usernames[-1] + 'salt123'

    return 'randomuser567user'
Esempio n. 3
def extractor():
    # Collect common nouns (NN/NNS) and proper nouns (NNP/NNPS) from
    # '../data/scraped_text_NYT.txt' and append them, one per line, to
    # '../data/nouns_scraped_text_NYT.txt' (common nouns first).
    # NOTE(review): truncated snippet -- the StanfordPOSTagger(...) arguments
    # and the body of the empty-sentence check are missing, so it does not
    # parse as shown.
    st = StanfordPOSTagger(
    nouns = []
    pnouns = []
    # Line counter; it is incremented below but never read in the visible code.
    i = 0

    with open('../data/scraped_text_NYT.txt', 'r',
              encoding='utf-8') as inputFile:
        # Read one line at a time; readline() returns "" at end of file.
        comment = inputFile.readline()
        while comment != "":
            sentences = sent_tokenize(comment, 'english')

            for sent in sentences:
                # NOTE(review): the statement guarded by this check is missing
                # (presumably it skipped blank sentences -- confirm).
                if (sent.strip() == ""):
                pos_tags = st.tag(sent.split())
                for pos_tag in pos_tags:
                    # pos_tag is indexed as (word, tag).
                    if (pos_tag[1] == 'NN' or pos_tag[1] == 'NNS'):
                        nouns = nouns + [pos_tag[0]]
                    elif (pos_tag[1] == 'NNP' or pos_tag[1] == 'NNPS'):
                        pnouns = pnouns + [pos_tag[0]]
            i = i + 1
            comment = inputFile.readline()

    # NOTE(review): the output file is opened in append mode and is not
    # closed anywhere in the visible code.
    outFile = open('../data/nouns_scraped_text_NYT.txt', 'a')
    for noun in nouns:
        outFile.write(noun + "\n")
    for pnoun in pnouns:
        outFile.write(pnoun + '\n')
    def __init__(self):
        # Set up the taggers and the lemmatizer used by this object, plus an
        # empty `tagged_output` dict.
        # NOTE(review): truncated snippet -- the arguments of both Stanford
        # constructors are missing, so it does not parse as shown.

        # stanford ner tagger
        from nltk.tag.stanford import StanfordNERTagger
        self.ner_stanford = StanfordNERTagger(

        # stanford pos tagger
        from nltk.tag.stanford import StanfordPOSTagger
        self.pos_stanford = StanfordPOSTagger(

        # spacy ner tagger
        import spacy
        self.ner_spacy = spacy.load('en')

        # wordnet lemmatizer
        from nltk.stem.wordnet import WordNetLemmatizer
        self.lemmatizer = WordNetLemmatizer()

        # Starts empty; nothing in the visible code fills it.
        self.tagged_output = {}
Esempio n. 5
def test_StanfordAndNLTKPOS():
    # Tag one sample sentence with both nltk.pos_tag and the Stanford tagger
    # and print the two results for manual comparison.
    # NOTE(review): truncated snippet -- the right-hand side of `start =` and
    # the StanfordPOSTagger(...) arguments are missing; it also uses Python 2
    # print statements.
    import nltk
    from nltk.tag.stanford import StanfordPOSTagger
    # `sent` is reassigned three times; only the last sentence is tagged.
    sent = 'a low-calorie sweetener that reduces caries activity and the growth and transmission of S. mutans.'
    sent = 'a wire formed by drawing a cast structure through a die; used in dentistry for partial denture clasps and orthodontic appliances.'
    sent = 'readily stained with acid dyes.'
    print chardet.detect(sent)
    #     sent='technique metered spray refers to a topical anesthetic dispersal technique that controls the amount and rate at which a drug is administered.'
    #     sent='older term for a traumatic ulcer of the oral mucosa.'
    #     sent='one or more vertically parallel surfaces of abutment teeth shaped to direct the path of placement and removal of a remarkable partial denture. Also called guiding plane.'
    #     sent='agents that bond, seal, or cement particles or objects together.'
    #     sent='teeth that are at such an angle as to cause them to be out of centric contact with opposing teeth during occlusion.'
    start =
    # Both taggers receive the same NLTK tokenization.
    text = nltk.word_tokenize(sent)
    nltk_pos = nltk.pos_tag(text)

    # Expose the Java location through the JAVAHOME environment variable.
    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    stanford_tagger = StanfordPOSTagger(
    stanford_pos = stanford_tagger.tag(text)
    print 'nltk_pos: ' + str(nltk_pos)
    print 'stanford_pos: ' + str(stanford_pos)
Esempio n. 6
 def tagWordsInSentences(self, studying, entry):
     '''Tag the part of speech for each word of *entry*.

     The tagger is chosen from the language being studied:

     * English -> ``parseWordsFromEntry`` + ``tagWords``
     * Japanese / Korean / Mandarin -> ``RakutenMA().tokenize``
     * Spanish / French -> Stanford POS tagger with the matching model

     :param studying: language identifier, looked up in the per-language
         collections on ``self`` (``self.english``, ``self.japanese``, ...).
     :param entry: the text to tag.
     :return: whatever the selected tagger returns for the entry.
     :raises ValueError: if ``studying`` is in none of the known collections.
     '''
     jar_path = 'stanford-postagger-full/stanford-postagger.jar'

     if studying in self.english:
         words = parseWordsFromEntry(entry)
         return tagWords(words)

     # Each collection needs its own membership test: a bare
     # ``or self.korean`` is truthy whenever that collection is non-empty and
     # would route every non-English language through RakutenMA.
     if (studying in self.japanese or studying in self.korean
             or studying in self.mandarin):
         rm = RakutenMA()
         return rm.tokenize(entry)

     if studying in self.spanish:
         model_path = 'stanford-postagger-full/models/spanish.tagger'
     elif studying in self.french:
         model_path = 'stanford-postagger-full/models/french.tagger'
     else:
         raise ValueError('No POS tagger configured for language: %r' % (studying,))

     words = parseWordsFromEntry(entry)
     postagger = StanfordPOSTagger(model_path, jar_path)
     return postagger.tag(words)
Esempio n. 7
def clean_words(tokens, filterStopwords=False, filterPos=None):
	"""Strip punctuation from *tokens* and optionally filter them.

	:param tokens: iterable of token strings.
	:param filterStopwords: when true, drop tokens that are Spanish stopwords.
	:param filterPos: when truthy, a container of allowed first letters of
		the POS tag; a token is kept only if the first character of its
		Stanford (Spanish model) tag is in this container.
	:return: list of cleaned tokens that passed every active filter. With no
		filter active every cleaned token is returned.
	"""
	# Only pay for the stopword corpus / the tagger when they are needed.
	stopwordList = set(stopwords.words('spanish')) if filterStopwords else None
	tagger = None
	if filterPos:
		tagger = StanfordPOSTagger('stanford/models/spanish.tagger', 'stanford/stanford-postagger.jar', encoding='utf8')

	punctuation = set(string.punctuation)
	cleanTokens = []
	for token in tokens:
		cleanToken = "".join(char for char in token if char not in punctuation)

		# Cheap check first: a stopword never needs to be tagged.
		if filterStopwords and cleanToken in stopwordList:
			continue

		if filterPos:
			res = tagger.tag([cleanToken])
			if not res:
				continue
			pos = res[0][1]
			# Guard against an empty tag before looking at its first letter.
			if not pos or pos[0] not in filterPos:
				continue

		cleanTokens.append(cleanToken)
	return cleanTokens
Esempio n. 8
def posTagging():
    # Tag a hard-coded question with the Stanford POS tagger, print the
    # result and look for proper-noun tags (NNP/NNPS).
    # NOTE(review): truncated snippet -- the StanfordPOSTagger(...) arguments
    # are missing; it also uses Python 2 print statements.
    #myNounPhrases = []
    # Never appended to in the visible code, so it is printed empty below.
    myCompletePOSStructure = []
    a = ['NNP', 'NNPS'] #Avoid NN,NNS. Only NNP , NNPS for purpose of NER.
    print '######## POS'
    english_postagger = StanfordPOSTagger(
    #abc = english_postagger.tag('Steve Jobs was Founder of Apple. He was born in United States of America'.split())
    abc = english_postagger.tag('Who was the CEO of IBM'.split())
    print abc
    for number in abc:
        #print number[0],number[1]
        # UTF-8 encoded copy of the pair; not used afterwards in the visible code.
        someTup = (number[0].encode('utf8'),number[1].encode('utf8'))
        #print someTup

        #print split1[0] + ' ' + split1[1]
        #print unicodedata.normalize('NFKD', split1[0]).encode('ascii','ignore')
        #print unicodedata.normalize('NFKD', split1[1]).encode('ascii', 'ignore')

    print myCompletePOSStructure

    for number in abc:
        # True when either element of the pair equals 'NNP' or 'NNPS'.
        if any(x in number for x in a):
            #print number
            # Splits the printed repr of the pair; split2 is not used further
            # in the visible code.
            split1 = str(number).split(',')
            split2 = str(split1[0]).split('u')
            # print split2[1].replace("'", "")
Esempio n. 9
    def __init__(self, translation_id):
        """Create the English/Spanish taggers and the empty per-language stores.

        :param translation_id: identifier stored on the instance as
            ``translation_id``.
        """
        # Locations of the Stanford tagger jar and its language models
        tagger_home = STANFORD_POS_TAGGER_LOCATION
        en_model = '{}/models/english-bidirectional-distsim.tagger'.format(tagger_home)
        es_model = '{}/models/spanish-distsim.tagger'.format(tagger_home)
        tagger_jar = '{}/stanford-postagger-3.7.0.jar'.format(tagger_home)

        self.translation_id = translation_id

        # One tagger per language, sharing the same jar
        self.en_tagger = StanfordPOSTagger(model_filename=en_model, path_to_jar=tagger_jar)
        self.es_tagger = StanfordPOSTagger(model_filename=es_model, path_to_jar=tagger_jar)

        # String literals from the VA3 files
        self.va3l1, self.va3l2 = [], []

        # Tokenized plaintext sentences
        self.l1_tok_sent, self.l2_tok_sent = [], []

        # Alignments, as lists of lists of ints
        self.l1_alignments, self.l2_alignments = [], []

        # POS tags, as lists of strings
        self.l1_pos_tags, self.l2_pos_tags = [], []
Esempio n. 10
def pos_tagging(docs, stanford_path, pos_tagger):
    # Add a POS tag to every (token, label) pair of every document in `docs`
    # and return the documents as lists of (token, pos, label) triples.
    # `stanford_path` is the tagger directory; `pos_tagger` is the model name
    # without the ".tagger" suffix.
    # NOTE(review): truncated snippet -- the rest of the StanfordPOSTagger(...)
    # call is missing, and the extra indentation around tagger.tag() suggests
    # a wrapper statement (presumably try/except -- confirm) was lost.
    print("\nGenerating Part-of-Speech tags...")

    # Configuring Stanford NLP POS tagger
    path_to_model = "{}/models/{}.tagger".format(stanford_path, pos_tagger)
    path_to_jar = "{}/stanford-postagger.jar".format(stanford_path)

    tagger = StanfordPOSTagger(model_filename=path_to_model,
    # Setting higher memory limit for long sentences
    tagger.java_options = '-mx8192m'

    data = []
    for doc in progressbar.progressbar(docs):
        # Obtain the list of tokens in the document
        tokens = [t for t, label in doc]

            # Perform POS tagging
            tagged = tagger.tag(tokens)

        # Take the word, POS tag, and its label
        # The original token `w` is kept; the tagger's own copy `word` is dropped.
        data.append([(w, pos, label)
                     for (w, label), (word, pos) in zip(doc, tagged)])
    return data
Esempio n. 11
def get_tagger():
    '''Build and return a Stanford POS tagger with a raised Java memory limit.

    Typical use: ``get_tagger().tag(word_tokenize(string))``.
    '''
    model_file = "/home/avery/Applications/stanford-postagger-2018-02-27/models/english-bidirectional-distsim.tagger"
    jar_file = "/home/avery/Applications/stanford-postagger-2018-02-27/stanford-postagger.jar"
    stanford_tagger = StanfordPOSTagger(model_file, jar_file)
    # Java heap option handed to the tagger.
    stanford_tagger.java_options = "-mx8192m"
    return stanford_tagger
Esempio n. 12
 def __init__(self):
     """Initializes the tagger object.

     Stores the model and jar locations taken from ``TAGGER_MODEL`` and
     ``POS_TAGGER_JAR_FILE``, builds a ``StanfordPOSTagger`` from them and
     records ``STANFORD_TAGGER_NAME`` as the tagger type.
     """
     self.model = TAGGER_MODEL
     self.jar_file = POS_TAGGER_JAR_FILE
     self.tagger = StanfordPOSTagger(self.model, self.jar_file)
     self.tagger_type = STANFORD_TAGGER_NAME
Esempio n. 13
    def part_of_speech_tagging(
            self, words: List[str],
            multi_word_name_entities: Set[str]) -> List[Tuple[str, str]]:
        """
        perform part-of-speech tagging using StanfordPOSTagger
        :param words: a list of words in a sentence
        :param multi_word_name_entities: a set of multi-word name entities
        :return: part-of-speech tag of the sentence
        """
        # define pos tagger
        path_to_model = 'stanford/pos/english-bidirectional-distsim.tagger'
        path_to_jar = 'stanford/pos/stanford-postagger.jar'
        pos_tagger = StanfordPOSTagger(path_to_model, path_to_jar)

        stan_pos_tag = pos_tagger.tag(words[:-1])  # omit the last period
        normal_pos_tag = nltk.pos_tag(words[:-1])  # omit the last period

        def post_treatment(stan_pos_tag: List[Tuple[str, str]],
                           norm_pos_tag: List[Tuple[str, str]],
                           multi_word_name_entities: Set[str]) -> None:
            """
            combine the multi-word name entities (modifies stan_pos_tag in place)
            since nltk.pos_tag label multi-word name entities together, so I correct stan_pos_tag by using norm_pos_tag
            the problem of norm_pos_tag is that it usually mislabels words, and that's why I prefer to use StanfordPOStagger
            :param stan_pos_tag: a list of pos-tags of sentences using stanford pos tagger
            :param norm_pos_tag: a list of pos-tags of sentences using nltk.pos_tag
            :param multi_word_name_entities: a set of multi-word name entities
            """
            stan_i = 0
            norm_i = 0
            # The lengths are re-read on every pass because entries are popped
            # from stan_pos_tag while merging; a cached length would run past
            # the end of the shrunken list.
            while stan_i < len(stan_pos_tag) and norm_i < len(norm_pos_tag):
                stan_word, stan_pos = stan_pos_tag[stan_i]
                norm_word, _ = norm_pos_tag[norm_i]
                entities = norm_word.split(' ')
                # check if word exists in multi_word_name_entities
                if stan_word == entities[0] and norm_word in multi_word_name_entities:
                    # scan the following words in stan_pos_tag and combine if they can form a multi-word entity
                    temp_i = stan_i + 1
                    match_idx = 1
                    while temp_i < len(stan_pos_tag) and match_idx < len(entities):
                        temp_word, _ = stan_pos_tag[temp_i]
                        if temp_word != entities[match_idx]:
                            # Stop at the first mismatch; without this the
                            # loop would never advance and spin forever.
                            break
                        # After the pop, temp_i already points at the next word.
                        stan_pos_tag.pop(temp_i)
                        match_idx += 1
                    stan_pos_tag[stan_i] = (norm_word, stan_pos)
                stan_i += 1
                norm_i += 1

        post_treatment(stan_pos_tag, normal_pos_tag, multi_word_name_entities)

        return stan_pos_tag
Esempio n. 14
class POSTagger:
    """POSTagger creates a POS tagger for german language. Different tagger are available to use.

    Pass one of the class constants ``STAN``, ``SFT``, ``TT`` or ``SPACY`` to
    the constructor to select the backend; ``tag`` then returns a list of
    ``(token, tag)`` tuples regardless of the backend.
    """
    STAN = "stanford-hgc-tagger"
    SFT = "stanford-fast-tagger"
    TT = "tree-tagger"
    SPACY = "spacy-tagger"

    # paths to Stanford tagger modules
    __path_to_jar = "C:/Users/din_m/MA/Stanford Tagger/stanford-postagger.jar"
    __model_file_name = "C:/Users/din_m/MA/Stanford Tagger/models/"

    def __init__(self, tagger):
        """Initialize a new POS tagger. Takes tagger parameter as an argument to define the kind of tagger.

        Raises:
            Exception: if ``tagger`` is not one of the supported constants.
        """
        # Validate first so that a wrong name fails fast, before any external
        # tool is started, and so that a valid "spacy-tagger" is not rejected.
        if tagger not in (POSTagger.STAN, POSTagger.SFT, POSTagger.TT, POSTagger.SPACY):
            raise Exception("Wrong tagger parameter.")

        self.tagger_name = tagger
        self.__tokenizer = StanfordTokenizer(path_to_jar=POSTagger.__path_to_jar)

        if tagger == POSTagger.STAN:
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-hgc.tagger")
        elif tagger == POSTagger.SFT:
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-fast.tagger")
        elif tagger == POSTagger.TT:
            self.__tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')
        else:
            # SpaCy takes really long to initialize (about 5-7 minutes), but performs well and fast afterwards
            self.__tagger = spacy.load('de')

    def tag(self, text):
        """POS tag text and return a list of (token, tag) tuples.

        For the Stanford backends the text is tokenized with the Stanford
        tokenizer first; TreeTagger and SpaCy receive the raw text.
        """
        if self.tagger_name in (POSTagger.SFT, POSTagger.STAN):
            tokens = self.__tokenizer.tokenize(text)
            return self.__tagger.tag(tokens)
        if self.tagger_name == POSTagger.TT:
            tags = self.__tagger.tag_text(text)
            # Keep only the first two fields of every TreeTagger entry.
            return [(item[0], item[1]) for item in treetaggerwrapper.make_tags(tags)]
        if self.tagger_name == POSTagger.SPACY:
            return [(word.orth_, word.tag_) for word in self.__tagger(text)]
        return None

#tagger = POSTagger("spacy-tagger")
#doc = tagger.tag(u"Bei mir zu Hause denken sie bestimmt, daß ich noch krank sei.")
#print(tagger.tag("Ich werde morgen in die Schule gehen."))
#print(tagger.tag("Hat Aglaja den Brief etwa der Alten gezeigt?«"))
Esempio n. 15
def pos_tag(to_tag, stanford_postagger_path):
    '''Tag tokens with their part of speech using the French Stanford model.

    to_tag is the tokenized text to tag; stanford_postagger_path is the
    Stanford POS tagger directory, which must contain
    "models\\french.tagger" and "stanford-postagger.jar" (the paths are
    built with Windows-style backslashes).'''
    french_model = stanford_postagger_path +"\\models\\french.tagger"
    tagger_jar = stanford_postagger_path +"\\stanford-postagger.jar"
    # The tagger is created with UTF-8 as its encoding.
    french_tagger = StanfordPOSTagger(french_model, tagger_jar, encoding='utf8')
    return french_tagger.tag(to_tag)
Esempio n. 16
def getTags(sen_arr):
    """Return the POS tag of every token in *sen_arr*.

    :param sen_arr: list of tokens forming one sentence.
    :return: list with one UTF-8 encoded tag per tagged token, in order.
    """
    st = StanfordPOSTagger('english-left3words-distsim.tagger')
    # Each tagger result is indexed as (word, tag); keep only the tag. The
    # loop previously computed the tag but never stored it, so the function
    # always returned an empty list.
    return [pair[1].encode("utf-8") for pair in st.tag(sen_arr)]
Esempio n. 17
    def determine_sentpos_by_nltk(self, sentence):
        """
        get pos collection for sentence from nltk

        :param sentence: sentence string; it is split on single spaces.
        :return: the Stanford tagger output for the resulting tokens.
        """
        pos_model_file = "C://python34/ProjectDragonWolf/nlp_res/stanford_pos/models/english-bidirectional-distsim.tagger"
        pos_jar_file = "C://python34/ProjectDragonWolf/nlp_res/stanford_pos/stanford-postagger.jar"
        pos = StanfordPOSTagger(model_filename=pos_model_file,
                                path_to_jar=pos_jar_file)
        return pos.tag(sentence.split(" "))
def pos_tagger(text):
    # Tag `text` with a Stanford POS tagger and return the tagger's output.
    # `text` is passed to tag() unchanged, so it is presumably already a list
    # of tokens -- confirm against the callers.
    # NOTE(review): truncated snippet -- the StanfordPOSTagger(...) arguments
    # are missing, so it does not parse as shown.
    from nltk.tag.stanford import StanfordPOSTagger
    english_postagger = StanfordPOSTagger(
    # Java heap option handed to the tagger.
    english_postagger.java_options = '-mx4096m'
    tags = english_postagger.tag(text)
    return tags
Esempio n. 19
def posInput(text):
	"""Tokenize *text* and return its Stanford POS tags (caseless English model)."""
	model_file = "./stanford-postagger/models/english-caseless-left3words-distsim.tagger"
	jar_file = "./stanford-postagger/stanford-postagger.jar"
	stanford = StanfordPOSTagger(model_file, jar_file)
	# Setting higher memory limit for long sentences
	stanford.java_options = '-mx4096m'
	tokens = word_tokenize(text)
	return stanford.tag(tokens)
Esempio n. 20
    def tagger(self):
        # Tag `self.clean_corpus` and return the result passed through
        # `self.dig2num`.
        # NOTE(review): truncated snippet -- the remaining StanfordPOSTagger(...)
        # argument(s) are missing, and the two spaCy statements below
        # presumably belonged to a lost `else:` branch (confirm); as shown
        # they would overwrite the Stanford result.
        if self.taggerUse == 'standford':
            tagger = StanfordPOSTagger('/home/jmutal/dataMining/stanford-postagger-full-2017-06-09/models/spanish-distsim.tagger',
            tagged_sents = tagger.tag_sents(self.clean_corpus)
            tagged_sents = self.nlp.pipe(self.clean_corpus, n_threads=8)
            tagged_sents = self.proccess_spacy(tagged_sents)

        return self.dig2num(tagged_sents)
Esempio n. 21
def transform_wnli(premise,hypothesis):
    # Turn a WNLI (premise, hypothesis) pair back into a WSC-style example.
    # Returns (sentence with the target span replaced by "_", candidate list
    # whose first entry is the extracted target) or (None, None) on failure.
    # NOTE(review): truncated snippet -- several statements are missing (the
    # bodies of the `if` lines in the matching loop, and the assignments of
    # `ok`, `best_masked_s`, `cased_premise` and `current`), so it does not
    # parse as shown.
    premise=[w.lower() for w in nltk.word_tokenize(premise)]

    #transform WNLI examples back into WSC format
    hypothesis = [w.lower() for w in nltk.word_tokenize(hypothesis)]
    best_target=["","","","","",""]#should get overwritten
    # Try every hypothesis span [l, r) of up to 5 tokens as the masked target.
    for l in range(len(hypothesis)):
        for r in range(l+1,l+6):
            left_part = hypothesis[:l]
            right_part = hypothesis[r:]
            pattern = left_part + ["_"]+ right_part
            # Slide the masked pattern over the premise.
            for s in range(len(premise)):
                if s+len(pattern)>len(premise):
                for a,b in zip(pattern,premise[s:s+len(pattern)]):
                    if a=="_":
                    if a==b:
                    if a in [',','.','?','!'] and b in [',','.','?','!']:#punctuation is ignored
                # Keep the shortest target span seen so far (ties replace).
                if ok and len(hypothesis[l:r])<=len(best_target):
                    best_target = hypothesis[l:r]
                    best_masked_s = premise[:s]+pattern+premise[s+len(pattern):]
    if len(best_masked_s)==0:#We failed
        return None,None
    #We extracted the masked sentence from the premise.
    # The tagger is created once and cached in the module-level POS_tagger.
    global POS_tagger
    if POS_tagger is None:
        os.environ['STANFORD_MODELS'] = "stanford-postagger-2018-10-16/models"
        os.environ['CLASSPATH'] = "stanford-postagger-2018-10-16"
        POS_tagger = StanfordPOSTagger("stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger")
    tagged_premise = POS_tagger.tag(nltk.word_tokenize(cased_premise))
    # Collect noun spans of the premise as lower-cased answer candidates.
    candidates = []
    for word,tag in tagged_premise:
        if tag in ["NN","NNS","NNP","NNPS"]:
            if current!=[]:
                candidates.append(" ".join(current).lower())
    if current!=[]:
        candidates.append(" ".join(current).lower())
    best_target=" ".join(best_target)
    # Drop candidates that contain, or are contained in, the target string.
    candidates=[c for c in candidates if c.find(best_target)==-1 and best_target.find(c)==-1]
    candidates = [best_target]+candidates
    found_sentence = " ".join(best_masked_s).replace(" n't","n't").replace(" 's","'s")#Sorry nltk
    return found_sentence,candidates
Esempio n. 22
 def tag(tokens):
     # Tag `tokens` with a Stanford POS tagger and return the tagger output.
     # NOTE(review): truncated snippet -- the StanfordPOSTagger(...) arguments
     # and the statement guarded by the `if` inside the loop are missing, so
     # it does not parse as shown.
     #java_path = "C:/Program Files/Java/jdk1.8.0_31/bin/java.exe"
     #os.environ['JAVAHOME'] = java_path
     # English articles, matched case-insensitively in the loop below.
     special_symbols_array = ["the", "a", "an"]
     english_postagger = StanfordPOSTagger(
     token_tag_array = english_postagger.tag(tokens)
     for element in token_tag_array:
         if element[0].lower() in special_symbols_array:
     return token_tag_array
	def __init__(self):
		"""Create the Stanford parser, NER and POS helpers plus a lemmatizer.

		The Stanford Parser, NER and POS tagger packages have to be downloaded
		from the Stanford website beforehand, and ``self.home`` has to point
		at the directory that contains them.
		"""
		# Both parsers take their configuration from environment variables.
		self.constituent_parse_tree = StanfordParser()
		self.stanford_dependency = StanfordDependencyParser()
		self.lemma = WordNetLemmatizer()

		# Base directory of the downloaded Stanford packages.
		self.home = '/home/ramesh'

		ner_model = self.home + '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz'
		ner_jar = self.home + '/stanford-ner-2017-06-09/stanford-ner.jar'
		self.ner = StanfordNERTagger(ner_model, ner_jar)

		pos_model = self.home + '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger'
		pos_jar = self.home + '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar'
		self.pos_tag = StanfordPOSTagger(pos_model, pos_jar)

		self.CharacterOffsetEnd = 0
		self.CharacterOffsetBegin = 0
Esempio n. 24
	def __init__(self):
		"""Create the Stanford parser, NER and POS helpers, a lemmatizer and
		the contraction lookup table."""
		self.constituent_parse_tree = StanfordParser()
		self.stanford_dependency = StanfordDependencyParser()
		self.lemma = WordNetLemmatizer()

		# Base directory that holds the Stanford NER and POS tagger packages.
		self.home = '/home/ramesh/Documents/mas_course/second_semester/rnd/rnd_submission_cd'

		ner_model = self.home + '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz'
		ner_jar = self.home + '/stanford-ner-2017-06-09/stanford-ner.jar'
		self.ner = StanfordNERTagger(ner_model, ner_jar)

		pos_model = self.home + '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger'
		pos_jar = self.home + '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar'
		self.pos_tag = StanfordPOSTagger(pos_model, pos_jar)

		self.CharacterOffsetEnd = 0
		self.CharacterOffsetBegin = 0

		# Contraction suffix -> expansion.
		self.contractions = {
			"'nt": "not",
			"'ll": " will",
			"'re": "are",
			"'ve": "have",
			"'m": "am",
		}
Esempio n. 25
    def create_pos(self, tweet):
        # Lower-case and tokenize `tweet`, tag it with a Stanford POS tagger,
        # store the result in `self.pos_tweet` and return it.
        # NOTE(review): truncated snippet -- the StanfordPOSTagger(...)
        # arguments are missing, so it does not parse as shown.
        self.pos_tweet = None

        tweet = word_tokenize(tweet.lower())

        english_pos = StanfordPOSTagger(

        self.pos_tweet = english_pos.tag(tweet)

        return self.pos_tweet
Esempio n. 26
	def __init__(self, posTagger, filePath, saveExtension, loadExtension, plainExtension, contextLength, replacementFrame, replacementFrameExtensions, replacementCount):
		"""Store the configuration and create the Stanford POS tagger.

		posTagger is the Stanford POS tagger directory; the model and jar
		paths are derived from it. filePath is the base path to which the
		save, load and plain extensions are appended. The remaining arguments
		are stored on the instance unchanged.
		"""
		# File locations derived from the base path.
		self.filePath = filePath
		self.savePath = filePath + saveExtension
		self.loadPath = filePath + loadExtension
		self.plainPath = filePath + plainExtension

		# Replacement settings, stored as given.
		self.contextLength = contextLength
		self.replacementFrame = replacementFrame
		self.replacementFrameExtensions = replacementFrameExtensions
		self.replacementCount = replacementCount
		self.allTokens = []

		# Tagger resources and the tagger itself.
		self.posTagger_Model = posTagger + "/models/english-bidirectional-distsim.tagger"
		self.posTagger_Jar = posTagger + "/stanford-postagger.jar"
		self.tagger = StanfordPOSTagger(
			self.posTagger_Model,
			self.posTagger_Jar,
			java_options='-mx1000m -Xmx1028m',
		)
Esempio n. 27
    def set_pos_tagger(self, model_path=POS_MODEL, jar_path=POS_JAR):
        """Setup path for Standford POS tagger.

        Both paths are joined onto ``self.lib_path``. The defaults are the
        module-level ``POS_MODEL`` and ``POS_JAR`` values.

        Args:
            model_path: path to the trained model
            jar_path: path to the JAR

        Raises:
            IOError: if the model file or the JAR file does not exist.
        """
        model_path = os.path.join(self.lib_path, model_path)
        jar_path = os.path.join(self.lib_path, jar_path)
        # Fail only when a file is missing; the raise must not run after a
        # successful setup.
        if not (os.path.isfile(model_path) and os.path.isfile(jar_path)):
            raise IOError('Cannot find POS tagging lib')
        self.pos_tagger = StanfordPOSTagger(model_path, jar_path)
Esempio n. 28
 def _POS(self, txt, id):
     # Tokenize and POS-tag `txt` with the Stanford tagger, open
     # '/home/ise/NLP/NLP3/pos/pos_<id>.txt' for writing and return the tags.
     # NOTE(review): truncated snippet -- the body of the write loop is
     # missing, so it does not parse as shown and the output format is unknown.
     # The 'ID' and 'pos' columns of self.df are dumped to 'pos_ner.csv'
     # (tab-separated) on every call, before any tagging happens.
     self.df[['ID', 'pos']].to_csv('pos_ner.csv', sep='\t')
     path_pos = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/stanford-postagger.jar'
     model_path = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger'
     from nltk.tag.stanford import StanfordPOSTagger
     tagger = StanfordPOSTagger(model_path, path_pos)
     tagger.java_options = '-mx8096m'  ### Setting higher memory limit for long sentences
     tokens = nltk.word_tokenize(txt)
     pos_res = tagger.tag(tokens)
     filepath = '/home/ise/NLP/NLP3/pos/pos_{}.txt'.format(id)
     with open(filepath, 'w') as file_handler:
         for item in pos_res:
     return pos_res
Esempio n. 29
def extractPOS(inputFile_data, inputFile_tags, inputFile_version,
    # Write one line per row of `inputFile_data` to `outputFile_pos`; rows
    # whose first field is not in `tag_version` additionally get the first
    # sentence of their text POS-tagged.
    # NOTE(review): truncated snippet -- the end of the signature, the
    # expression that loads the tags, the body of the inner `for tag` loop,
    # the StanfordPOSTagger(...) arguments and most of the final write loop
    # are missing, so it does not parse as shown. It also uses Python 2 print
    # statements, and none of the opened files is closed in the visible code.
    f = open(inputFile_tags)
    allTags = set(","))  # Load all tags

    f = open(inputFile_version)
    lines = f.readlines()
    tag_version = []  # tags with version number
    for index, row in enumerate(lines):
        # Whitespace-separated row: items[0] is a tag name, items[1] a
        # comma-separated list.
        items = row.strip().split()
        if items[0] in allTags:
            for tag in items[1].split(","):

    print "The number of tag_version is: ", len(tag_version)
    tag_version = set(tag_version)

    fw_pos = open(outputFile_pos, "w")
    english_postagger = StanfordPOSTagger(
    f = open(inputFile_data)
    lines = f.readlines()

    for index, row in enumerate(lines):
        # Progress report every 300 rows.
        if index % 300 == 0:
            print index, " Finish ", float(index) / len(lines)
        items = row.strip().split("		")

        # if index >=5000 and index < 6000 and items[0] in tag_version:
        # Versioned tags only get their index and name written.
        if items[0] in tag_version:
            fw_pos.write(str(index) + "	" + items[0] + "	\n")
        if items[0] not in tag_version:

            fw_pos.write(str(index) + "	" + items[0] + "	")
            if len(items) > 1:
                # Only the text before the first ". " is tagged.
                text = items[1].split(". ")[0].decode('utf-8')
                pos = english_postagger.tag(text.split())

                for p in pos:
                    fw_pos.write("	")

Esempio n. 30
def transform_to_pos(text):
    """Return normalized POS-tag and punctuation frequencies for *text*.

    The text is tokenized with ``word_tokenize`` and tagged with the Stanford
    POS tagger. The result maps each feature name in the fixed list below to:

    * for POS tags: the share of tokens carrying that tag (0 if absent),
    * ``"totalWordsCount"``: the number of tokens,
    * ``";"``, ``"questionmarks"``, ``"exclamationmarks"``, ``"Quotes"``: the
      share of tokens equal to ``;``, ``?``, ``!`` and ``"`` respectively.

    Tags outside the fixed list are ignored. For text with no tokens every
    feature is 0 and the tagger is not started.

    :param text: raw input string.
    :return: dict keyed by feature name, in the order of the fixed list.
    """
    from collections import Counter
    from nltk.tag.stanford import StanfordPOSTagger
    from nltk import word_tokenize

    feature_order = [
        'NNP', 'VBD', 'VBN', 'IN', 'CD', 'VBP', ',', 'DT', 'NN', 'JJ', 'RB',
        'TO', 'SYM', 'PRP', 'NNS', 'CC', 'PRP$', 'POS', 'FW', 'VBG', ':',
        'WRB', 'EX', 'JJR', 'WDT', 'totalWordsCount', ';', 'questionmarks',
        'exclamationmarks', 'Quotes'
    ]

    tokens = word_tokenize(text)
    size = len(tokens)

    # Every feature starts at 0 so that tags absent from the text are still
    # present in the result instead of raising KeyError.
    features = dict.fromkeys(feature_order, 0)
    features["totalWordsCount"] = size
    if size == 0:
        # Nothing to tag; also avoids dividing by zero below.
        return features

    path_to_model = "./postagging/english-bidirectional-distsim.tagger"
    path_to_jar = "./postagging/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = '-mx4096m'  ### Setting higher memory limit for long sentences

    total = float(size)  # force true division on Python 2 as well
    tag_counts = Counter(tag for _, tag in tagger.tag(tokens))
    for tag, count in tag_counts.items():
        if tag in features:
            features[tag] = count / total

    # Explicit features are written last so they win over any equally named tag.
    features["totalWordsCount"] = size
    features[";"] = tokens.count(";") / total
    features["questionmarks"] = tokens.count("?") / total
    features["exclamationmarks"] = tokens.count("!") / total
    # NOTE(review): word_tokenize may rewrite double quotes as `` and '',
    # in which case this count stays 0 -- confirm the intended behavior.
    features["Quotes"] = tokens.count("\"") / total
    return features
Esempio n. 31
def build_question_set():
    """POS-tag every 'kprestval' question and save the tags as JSON.

    The output file 'data/kprestval_pos_tags.json' holds a single
    'pos_tags' mapping from question id to the tagger output for the
    lower-cased, tokenized question.
    """
    output_path = 'data/kprestval_pos_tags.json'
    stanford = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    entries = split_data_by_seed(load_and_process_metadata('val'), 'kprestval')
    total = len(entries)

    tags_by_question = {}
    for idx, entry in enumerate(entries):
        qid = entry.question_id
        tokens = word_tokenize(entry.question.lower())
        tags_by_question[qid] = stanford.tag(tokens)
        # Progress output; idx is zero-based.
        print('\nPOS TAGGER: %d/%d' % (idx, total))

    save_json(output_path, {'pos_tags': tags_by_question})
Esempio n. 32
 def __init__(self):
     """Create the POS tagger, stemmer, stopword list, type words and
     sentiment resources used by this object.

     The tagger model and jar are looked up below ``self.modeldir`` and the
     stopwords are read from ``self.stopword_path``.
     """
     tagger_model = self.modeldir + '/postagger/models/english-bidirectional-distsim.tagger'
     tagger_jar = self.modeldir + '/postagger/stanford-postagger.jar'
     self._postagger = StanfordPOSTagger(tagger_model, tagger_jar)

     self._stemmer = nltk.SnowballStemmer("english")
     self._stopwords = stopword(self.stopword_path)

     # Built by helper methods of this class, in this order.
     self._type_words = self._set_type_words()
     self._sentiment = self._get_sentiment()
Esempio n. 33
	def __init__(self, text):
		"""Tokenize and POS-tag *text* and precompute its tagged bigrams.

		The Stanford tagger files are expected in a "stanford-postagger"
		directory next to this source file.
		"""
		self.text = text

		# Resolve the tagger files relative to this file and export their
		# locations through the environment.
		base_dir = os.path.dirname(os.path.realpath(__file__))
		jar_file = base_dir + '/stanford-postagger/stanford-postagger.jar'
		model_file = base_dir + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
		os.environ["STANFORD_PARSER"] = jar_file
		os.environ["STANFORD_MODELS"] = base_dir + "/stanford-postagger/models/"
		self.stanford = StanfordPOSTagger(model_file, jar_file)

		# Sentences and words come from the UTF-8 encoded text.
		encoded_text = text.encode("utf-8")
		self.sentences = sent_tokenize(encoded_text)
		self.words = word_tokenize(encoded_text)

		# The tags are computed on the lower-cased text.
		lowered_tokens = word_tokenize(text.lower())
		self.tags = self.stringifyTuples(self.stanford.tag(lowered_tokens))
		self.taggedBigrams = self.ngramsAndTags(2)
Esempio n. 34
    def __init__(self, tagger):
        """Initialize a new POS tagger. Takes tagger parameter as an argument to define the kind of tagger.

        Raises:
            Exception: if ``tagger`` is not one of ``POSTagger.STAN``,
                ``POSTagger.SFT``, ``POSTagger.TT`` or ``POSTagger.SPACY``.
        """
        # Validate first so that a wrong name fails fast, before any external
        # tool is started, and so that a valid "spacy-tagger" is not rejected.
        if tagger not in (POSTagger.STAN, POSTagger.SFT, POSTagger.TT, POSTagger.SPACY):
            raise Exception("Wrong tagger parameter.")

        self.tagger_name = tagger
        self.__tokenizer = StanfordTokenizer(path_to_jar=POSTagger.__path_to_jar)

        if tagger == POSTagger.STAN:
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-hgc.tagger")
        elif tagger == POSTagger.SFT:
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-fast.tagger")
        elif tagger == POSTagger.TT:
            self.__tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')
        else:
            # SpaCy takes really long to initialize (about 5-7 minutes), but performs well and fast afterwards
            self.__tagger = spacy.load('de')
Esempio n. 35
class String2POSNGramsList(String2TokenList):
    """Turn a string into a list of POS-tag n-grams.

    The text is tokenized by the parent class (``token_lst``), tagged with
    the Stanford POS tagger, and the resulting tag sequence is cut into
    space-joined n-grams of size ``n``.
    """

    def __init__(self, n=1, tagger_cls='english-left3words-distsim.tagger'):
        """Create the converter.

        :param n: n-gram size.
        :param tagger_cls: Stanford tagger model to load, e.g.
            'english-bidirectional-distsim.tagger' or
            'english-left3words-distsim.tagger'.
        """
        super(String2POSNGramsList, self).__init__()

        # N-Grams size
        self.n = n

        # Tagger Class Selection... See detail in Stanford Tagger documentation.
        self.tagger_cls = tagger_cls

        # Getting the Stanford tagger instance.
        self.spt = StanfordPOSTagger(self.tagger_cls)
        self.spt.java_options = '-mx10g'

    @property
    def N(self):
        """The n-gram size."""
        return self.n

    @N.setter
    def N(self, value):
        self.n = value

    @property
    def Tagger_cls(self):
        """The configured tagger model name."""
        return self.tagger_cls

    @Tagger_cls.setter
    def Tagger_cls(self, value):
        # NOTE(review): this only records the new name; the tagger instance
        # in self.spt is not rebuilt -- confirm whether it should be.
        self.tagger_cls = value

    def terms_lst(self, text):
        """Return the POS-tag n-grams of *text* as a list of strings."""
        # Getting the Analysed list of tokens.
        analyzed_terms_lst = self.token_lst(text)

        # Tagging the Analyzed terms list and getting the tags list as terms.
        pos_tags = [pos for t, pos in self.spt.tag(analyzed_terms_lst)]

        # Constructing the POS-tag N-Grams List; empty when there are fewer
        # than n tags.
        return [
            " ".join(pos_tags[i: i + self.n])
            for i in range(len(pos_tags) - self.n + 1)
        ]
Esempio n. 36
def get_pos_sentence(sentences_spans,pos_vocab):
    """
    Get POS tags for each sentence. (needed to build end2end system)

    :param sentences_spans: iterable of sentence records; element ``[0]`` of
        each record is the raw sentence text.
    :param pos_vocab: passed through and returned as-is.
    :return: tuple ``(pos_sentences, pos_vocab)`` where ``pos_sentences`` holds
        one tagged token list per input sentence.
    """
    english_postagger = StanfordPOSTagger(
        StandforParser,    #### in folder data/
        StandforParser_jar) #### in folder data/
    english_postagger.java_options = '-mx8000m'
    pos_sentences = list()

    for sent_span in sentences_spans:
        raw_sentence = sent_span[0]
        text = nltk.word_tokenize(raw_sentence)
        # StanfordPOSTagger fails to tag the underscore. With nltk 3.2.2, change
        # "word_tags = tagged_word.strip().split(self._SEPARATOR)" in
        # "parse_output" to
        # "word_tags = tagged_word.strip().rsplit(self._SEPARATOR,1)".
        text_pos = english_postagger.tag(text)

        # The position must advance with every token; it used to stay at 0, so
        # only the first token was ever compared and rewritten.
        for index, (word, token) in enumerate(zip(text, text_pos)):
            if word != token[0]:
                continue
            if token[0] == "``" and word not in raw_sentence:
                # nltk rewrote an opening double quote as ``; restore it.
                text_pos[index] = ["\"", "``"]
            elif token[0] == "''" and word not in raw_sentence:
                # nltk rewrote a closing double quote as ''; restore it.
                text_pos[index] = ["\"", "\'\'"]
            elif token[0] in ['{','(','[']:
                text_pos[index] = [token[0],"("]
            elif token[0] in ['}',')',']']:
                text_pos[index] = [token[0],")"]

        # Without this the function always returned an empty list.
        pos_sentences.append(text_pos)

    # NOTE(review): pos_vocab is not modified by the visible code; if it is
    # meant to collect tag counts, that update is missing from this snippet.
    return pos_sentences,pos_vocab
Esempio n. 37
	def __init__(self, pos_model, stanford_tagger, java_path):
		"""
		Creates a POSTagSelector instance.

		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
		"""
		# The Java location is published through the JAVAHOME environment variable
		# before the tagger is built; this changes process-wide state.
		os.environ['JAVAHOME'] = java_path
		self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)
Esempio n. 38
	def __init__(self, condprob_model, pos_model, stanford_tagger, java_path):
		"""
		Creates a POSTagSelector instance.

		@param condprob_model: Path to a binary conditional probability model.
		For instructions on how to create the model, please refer to the LEXenstein Manual.
		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
		"""
		# The Java location is published through the JAVAHOME environment variable
		# before the tagger is built; this changes process-wide state.
		os.environ['JAVAHOME'] = java_path
		self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)
		# SECURITY: unpickling executes arbitrary code; only load model files
		# that come from a trusted source.
		with open(condprob_model, 'rb') as model_file:
			self.model = pickle.load(model_file)
Esempio n. 39
    def __init__(self, n=1, tagger_cls='english-left3words-distsim.tagger'):
        """Set up the N-gram size and the Stanford tagger used for POS tagging.

        Known tagger models:
          1. 'english-bidirectional-distsim.tagger'
          2. 'english-left3words-distsim.tagger'
        See the Stanford Tagger documentation for details on model selection.
        """
        super(String2POSNGramsList, self).__init__()

        # N-gram size and tagger model name are kept for later use.
        self.n, self.tagger_cls = n, tagger_cls

        # Build the Stanford tagger and raise the JVM memory ceiling.
        # (alternative backend: CoreNLPPOSTagger(url='http://localhost:9000'))
        stanford_tagger = StanfordPOSTagger(tagger_cls)
        stanford_tagger.java_options = '-mx10g'
        self.spt = stanford_tagger
    def __init__(self, text):
        """Tokenize and POS-tag ``text`` once and cache the results.

        Sets ``text``, ``stanford`` (the tagger), ``sentences``, ``words``,
        ``tags`` (tagged, lower-cased tokens) and ``taggedBigrams``.
        """
        self.text = text

        # NOTE(review): the path suffixes were empty in this snippet; they are
        # restored from the complete NltkHelper constructor found later in this
        # file. ``_file_`` was a mangled ``__file__``.
        root = os.path.dirname(os.path.realpath(__file__))
        os.environ["STANFORD_PARSER"] = root + "/stanford-postagger/stanford-postagger.jar"
        os.environ["STANFORD_MODELS"] = root + "/stanford-postagger/models/"
        _path_to_model = root + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
        _path_to_jar = root + '/stanford-postagger/stanford-postagger.jar'
        self.stanford = StanfordPOSTagger(_path_to_model, _path_to_jar)
        self.sentences = sent_tokenize(text.encode("utf-8"))
        self.words = word_tokenize(text.encode("utf-8"))

        self.tags = self.stringifyTuples(self.stanford.tag(word_tokenize(text.lower())))

        self.taggedBigrams = self.ngramsAndTags(2)
Esempio n. 41
	def __init__(self, vector_model, pos_model, stanford_tagger, java_path, pos_type='none'):
		"""
		Creates an instance of the WordVectorSelector class.

		@param vector_model: Path to a binary word vector model.
		For instructions on how to create the model, please refer to the LEXenstein Manual.
		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
		@param pos_type: The type of POS tags with which the model's words are annotated, if any.
		Values supported: none, treebank, paetzold
		"""
		self.model = gensim.models.word2vec.Word2Vec.load_word2vec_format(vector_model, binary=True)
		self.pos_type = pos_type
		# The Java location is published through the JAVAHOME environment variable
		# before the tagger is built; this changes process-wide state.
		os.environ['JAVAHOME'] = java_path
		self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)
Esempio n. 42
class POSTagSelector:
	"""Substitution selector that keeps only the candidates whose POS tag equals the target word's tag."""

	def __init__(self, pos_model, stanford_tagger, java_path):
		"""
		Creates a POSTagSelector instance.

		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
		"""
		os.environ['JAVAHOME'] = java_path
		self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)

	def selectCandidates(self, substitutions, victor_corpus):
		"""
		Selects which candidates can replace the target complex words in each instance of a VICTOR corpus.

		@param substitutions: Candidate substitutions to be filtered.
		It can be in two formats:
		A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions.
		Example: substitutions['perched'] = {'sat', 'roosted'}
		A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector.
		Example: [['sat', 'roosted'], ['easy', 'uncomplicated']]
		@param victor_corpus: Path to a corpus in the VICTOR format.
		For more information about the file's format, refer to the LEXenstein Manual.
		@return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus.
		"""
		selected_substitutions = []

		if isinstance(substitutions, list):
			substitution_candidates = substitutions
		elif isinstance(substitutions, dict):
			void = VoidSelector()
			substitution_candidates = void.selectCandidates(substitutions, victor_corpus)
		else:
			# Without this "else" the error was reported for every dictionary input.
			print('ERROR: Substitutions are neither a dictionary or a list!')
			return selected_substitutions

		#Read VICTOR corpus:
		sents = []
		heads = []
		words = set([])
		with open(victor_corpus) as lexf:
			for c, line in enumerate(lexf):
				data = line.strip().split('\t')
				sents.append(data[0].strip().split(' '))
				heads.append(int(data[2].strip()))
				# Every candidate must be tagged on its own further below.
				words.update(substitution_candidates[c])

		#Tag sentences:
		tagged_sents = self.tagger.tag_sents(sents)

		#Tag words:
		words = list(words)
		tagged_words = self.tagger.tag_sents([[w] for w in words])
		word_to_tag = {}
		for word, tagged in zip(words, tagged_words):
			word_to_tag[word] = tagged[0][1]

		#Keep the candidates that share the POS tag of the target:
		for i, head in enumerate(heads):
			target_pos = str(tagged_sents[i][head][1])
			candidates = set(substitution_candidates[i])
			selected_substitutions.append(self.getCandidatesWithSamePOS(candidates, word_to_tag, target_pos))
		return selected_substitutions

	def getTargetPOS(self, sent, target, head):
		"""
		Returns the POS tag of the token at position "head" of the tokenized sentence "sent".
		If tagging the sentence raises UnicodeDecodeError, the target word is tagged alone;
		if that fails too, the string 'None' is returned.
		"""
		try:
			return nltk.pos_tag(sent)[head][1]
		except UnicodeDecodeError:
			try:
				# pos_tag expects a token list; a bare string would be tagged
				# character by character.
				return nltk.pos_tag([target])[0][1]
			except UnicodeDecodeError:
				return 'None'

	def getCandidatesWithSamePOS(self, candidates, word_to_tag, target_pos):
		"""
		Returns the set of candidates whose tag in "word_to_tag" equals "target_pos".
		Candidates without an entry in "word_to_tag" are dropped.
		"""
		result = set([])
		for candidate in candidates:
			if candidate in word_to_tag and word_to_tag[candidate]==target_pos:
				result.add(candidate)
		return result

	def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False):
		"""
		Saves a set of selected substitutions in a file in VICTOR format.

		@param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected.
		@param substitutions: The vector of substitutions selected for the VICTOR corpus.
		@param output_path: The path in which to save the resulting VICTOR corpus.
		@param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution.
		"""
		with open(victor_corpus) as f:
			with open(output_path, 'w') as o:
				# Instances and substitution sets are paired line by line.
				for subs, line in zip(substitutions, f):
					data = line.strip().split('\t')
					sentence = data[0].strip()
					target = data[1].strip()
					head = data[2].strip()

					newline = sentence + '\t' + target + '\t' + head + '\t'
					for sub in subs:
						newline += '0:'+sub + '\t'
					# The documented flag used to be ignored.
					if addTargetAsCandidate and target not in subs:
						newline += '0:'+target + '\t'
					o.write(newline.strip() + '\n')
Esempio n. 43
# /usr/local/bin/python
# coding: latin-1

import nltk, string, os

from random import randint
from nltk.tag.stanford import StanfordPOSTagger

# Tell NLTK where the Stanford tagger jar and its models live.
os.environ['CLASSPATH'] = '../ressources/standforPOS/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = '../ressources/standforPOS/models'

st = StanfordPOSTagger('german-hgc.tagger')


# Import the text

# The read call was missing after "raw_text ="; the whole file is loaded and
# the handle is closed as soon as the content is in memory.
with open("../ressources/reiseberichtIndien.txt") as f:
    raw_text = f.read()
# Keep only the part between the first chapter heading and the publisher note.
start_bookmark = raw_text.find("Erstes Kapitel")
end_bookmark = raw_text.rfind("Im Verlag von R")
text = raw_text[start_bookmark:end_bookmark]

#  Tokenize the text
Esempio n. 44
	elif tag.startswith('J'):
		result = 'J'
	elif tag.startswith('W'):
		result = 'W'
	elif tag.startswith('PRP'):
		result = 'P'
		result = tag.strip()
	return result

# Locations of the Stanford POS tagger model, its jar and the Java executable.
model = '/export/data/ghpaetzold/benchmarking/lexmturk/scripts/evaluators/stanford-postagger-full-2015-04-20/models/english-bidirectional-distsim.tagger'
tagger = '/export/data/ghpaetzold/benchmarking/lexmturk/scripts/evaluators/stanford-postagger-full-2015-04-20/stanford-postagger.jar'
java = '/usr/bin/java'

# The Java path is exported through JAVAHOME before the tagger is created.
os.environ['JAVAHOME'] = java
# "tagger" is rebound here: it held the jar path above and holds the tagger
# object (started with a 6 GB Java heap option) from this line on.
tagger = StanfordPOSTagger(model, tagger, java_options='-Xmx6g')

# Tab-separated ratings file read line by line by the loop that follows.
f = open('ratings.txt')

# Accumulators filled by the loop that follows.
sents1 = []
sents2 = []
heads1 = []
heads2 = []
for line in f:
	data = line.strip().split('\t')
	word1 = data[1].strip()
	word2 = data[3].strip()
	sent1 = data[5].strip()
	newsent1 = ''
	tokens = sent1.split(' ')
	index1 = -1
Esempio n. 45
class WordVectorSelector:
	"""Substitution selector that ranks candidates by the cosine distance between their word vector and the vector of the target word's context."""

	def __init__(self, vector_model, pos_model, stanford_tagger, java_path, pos_type='none'):
		"""
		Creates an instance of the WordVectorSelector class.

		@param vector_model: Path to a binary word vector model.
		For instructions on how to create the model, please refer to the LEXenstein Manual.
		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
		@param pos_type: The type of POS tags with which the model's words are annotated, if any.
		Values supported: none, treebank, paetzold
		"""
		self.model = gensim.models.word2vec.Word2Vec.load_word2vec_format(vector_model, binary=True)
		self.pos_type = pos_type
		os.environ['JAVAHOME'] = java_path
		self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)

	def selectCandidates(self, substitutions, victor_corpus, proportion=1.0, proportion_type='percentage', stop_words_file=None, window=99999, onlyInformative=False, keepTarget=False, onePerWord=False):
		"""
		Selects which candidates can replace the target complex words in each instance of a VICTOR corpus.

		@param substitutions: Candidate substitutions to be filtered.
		It can be in two formats:
		A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions.
		Example: substitutions['perched'] = {'sat', 'roosted'}
		A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector.
		Example: [['sat', 'roosted'], ['easy', 'uncomplicated']]
		@param victor_corpus: Path to a corpus in the VICTOR format.
		For more information about the file's format, refer to the LEXenstein Manual.
		@param proportion: Percentage of substitutions to keep.
		If proportion_type is set to "percentage", then this parameter must be a floating point number between 0 and 1.
		If proportion_type is set to "integer", then this parameter must be an integer number.
		@param proportion_type: Type of proportion to be kept.
		Values supported: percentage, integer.
		@param stop_words_file: Path to the file containing stop words of the desired language.
		The file must contain one stop word per line.
		@param window: Number of tokens around the target complex sentence to consider as its context.
		@param onlyInformative: If True, only content words are considered as part of the complex word's context, such as nouns, verbs, adjectives and adverbs.
		@param keepTarget: If True, the complex target word is also included as part of its context.
		@param onePerWord: If True, a word in the complex word's context can only contribute once to its resulting word vector.
		@return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus.
		"""
		#Initialize selected substitutions:
		selected_substitutions = []

		#Read stop words:
		stop_words = set([])
		if stop_words_file is not None:
			with open(stop_words_file) as stopf:
				stop_words = set([word.strip() for word in stopf])

		#Configure input:
		if isinstance(substitutions, list):
			substitution_candidates = substitutions
		elif isinstance(substitutions, dict):
			void = VoidSelector()
			substitution_candidates = void.selectCandidates(substitutions, victor_corpus)
		else:
			# Without this "else" the error was reported for every dictionary input.
			print('ERROR: Substitutions are neither a dictionary or a list!')
			return selected_substitutions

		#Parse sentences (the corpus is read once and reused for tagging and ranking):
		with open(victor_corpus) as lexf:
			instances = [line.strip().split('\t') for line in lexf]
		sents = [data[0].strip().split(' ') for data in instances]
		tagged_sents = self.tagger.tag_sents(sents)

		#Transform them to the right format:
		if self.pos_type=='paetzold':
			tagged_sents = [[(token[0], getGeneralisedPOS(token[1])) for token in sent] for sent in tagged_sents]

		#Rank candidates:
		for c, data in enumerate(instances):
			sent = data[0].strip()
			head = int(data[2].strip())
			pos_tags = tagged_sents[c]
			target_pos = pos_tags[head][1]

			target_vec = self.getSentVec(sent, head, stop_words, window, onlyInformative, keepTarget, onePerWord, pos_tags)

			candidate_dists = {}
			for candidate in substitution_candidates[c]:
				candidate_vec = self.getWordVec(candidate, target_pos)
				try:
					candidate_dists[candidate] = cosine(candidate_vec, target_vec)
				except ValueError:
					# Incompatible (e.g. empty) vectors: the candidate is left out.
					continue

			selected_substitutions.append(self.getFinalCandidates(candidate_dists, proportion, proportion_type))

		return selected_substitutions

	def _withPOS(self, word, pos_tokens, i):
		"""Returns "word" in the form used as model key: "word|||TAG" when the model is POS annotated, the bare word otherwise."""
		if self.pos_type!='none':
			return word + '|||' + pos_tokens[i][1]
		return word

	def getSentVec(self, sentence, head, stop_words, window, onlyInformative, keepTarget, onePerWord, pos_tokens):
		"""
		Builds the context vector of the word at position "head" of "sentence".

		The vectors of the accepted context tokens (at most "window" tokens on each side,
		excluding stop words and, if requested, non-content words) are summed and divided
		by the number of accepted tokens. Tokens missing from the model contribute nothing
		to the sum but still count in the divisor.
		@return: The context vector, or an empty array when no accepted token is in the model.
		"""
		informative_tags = set([])
		if onlyInformative:
			if self.pos_type=='treebank':
				informative_tags = set(['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB', 'RBR', 'RBS'])
			if self.pos_type=='paetzold':
				informative_tags = set(['N', 'V', 'J', 'R'])

		tokens = sentence.split(' ')
		valid_tokens = []
		if keepTarget:
			valid_tokens.append(self._withPOS(tokens[head].strip(), pos_tokens, head))

		# Left context first, then right context.
		positions = list(range(max(0, head-window), head)) + list(range(head+1, min(len(tokens), head+1+window)))
		for i in positions:
			# Tags are compared in upper case: the tag sets above are upper case, so the
			# previous lower-cased comparison rejected every token.
			if len(informative_tags)>0 and pos_tokens[i][1].upper().strip() not in informative_tags:
				continue
			if tokens[i] in stop_words:
				continue
			valid_tokens.append(self._withPOS(tokens[i], pos_tokens, i))

		if onePerWord:
			valid_tokens = list(set(valid_tokens))

		result = []
		for token in valid_tokens:
			try:
				vector = self.model[token]
			except Exception:
				# Best effort: tokens the model cannot resolve are skipped.
				continue
			if len(result)==0:
				result = vector
			else:
				result = np.add(result, vector)

		# Nothing to average: dividing the empty list used to raise a TypeError.
		if len(result)==0:
			return np.array([])
		return result/float(len(valid_tokens))

	def getWordVec(self, candidate, target_pos):
		"""Returns the model vector of "candidate" (POS annotated when the model is), or an empty array when the lookup fails."""
		if self.pos_type!='none':
			cand = candidate + '|||' + target_pos
		else:
			cand = candidate

		try:
			return self.model[cand]
		except Exception:
			# Best effort: unknown candidates get an empty vector.
			return np.array([])

	def getFinalCandidates(self, candidate_dists, proportion, proportion_type):
		"""
		Returns the candidates ordered by increasing distance, cut according to "proportion".
		At least one candidate is kept whenever any is available.
		"""
		result = sorted(list(candidate_dists.keys()), key=candidate_dists.__getitem__)
		if proportion_type=='percentage':
			return result[0:max(1, int(proportion*float(len(result))))]
		elif proportion_type=='integer':
			if proportion>=len(result):
				return result
			else:
				return result[0:max(1, int(proportion))]
		else:
			print('Unrecognized proportion type.')
			return result

	def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False):
		"""
		Saves a set of selected substitutions in a file in VICTOR format.

		@param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected.
		@param substitutions: The vector of substitutions selected for the VICTOR corpus.
		@param output_path: The path in which to save the resulting VICTOR corpus.
		@param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution.
		"""
		with open(victor_corpus) as f:
			with open(output_path, 'w') as o:
				# Instances and substitution sets are paired line by line.
				for subs, line in zip(substitutions, f):
					data = line.strip().split('\t')
					sentence = data[0].strip()
					target = data[1].strip()
					head = data[2].strip()

					newline = sentence + '\t' + target + '\t' + head + '\t'
					for sub in subs:
						newline += '0:'+sub + '\t'
					# The documented flag used to be ignored.
					if addTargetAsCandidate and target not in subs:
						newline += '0:'+target + '\t'
					o.write(newline.strip() + '\n')
Esempio n. 46
## This code extracts the features for several glosses and stores it in two text files to be fed to evaluation. py or

## import everything needed
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordPOSTagger
import string
from import singularize
import subprocess
import os

## set variables
# Stanford POS tagger using the German "dewac" model.
st = StanfordPOSTagger('german-dewac.tagger')
featuresPhrases = []
finalRatings = []
# NOTE(review): presumably the SMOR installation directory - confirm.
path = '/home/hanna/Documents/SMOR/'

## read in the word frequencies from DeReWo
# The handle is consumed line by line by the loop that follows.
derewo = open('derewo-v-ww-bll-320000g-2012-12-31-1.0.txt')
freqWo= []
freqNo= []
for lines in derewo:
	lines = lines.strip()               
	parts = lines.split(" ")
Esempio n. 47
	# Builds a Stanford POS tagger from the bundled bidirectional English model and jar.
	# NOTE(review): the attribute that receives the tagger was lost when this snippet
	# was extracted (the text between "def __init__(self):" and "=" is missing), so
	# this line is not valid Python as it stands - restore it from the original source.
	def __init__(self): = StanfordPOSTagger('stanford-postagger/models/english-bidirectional-distsim.tagger', 'stanford-postagger/stanford-postagger.jar')
Esempio n. 48
# Helper around the Stanford POS tagger: tagging, verb stemming and sentence splitting.
# NOTE(review): several lines of this class were lost in extraction (assignment
# targets, tagger calls, else/try lines); the gaps are marked below and must be
# restored from the original source before the class can run.
class SenticParser:
	# Builds the Stanford POS tagger from the bundled English model and jar.
	# NOTE(review): the attribute receiving the tagger is missing before "=".
	def __init__(self): = StanfordPOSTagger('stanford-postagger/models/english-bidirectional-distsim.tagger', 'stanford-postagger/stanford-postagger.jar')

	# Returns the tagged sentence as one string: the parts of every tagged item
	# are joined with "/" and the items are separated (and followed) by a space.
	def TaggedSentenceSlashForm(self, sentence ):

		#print sentence.split()
		# NOTE(review): the tagger call that produces "Tagged" is missing.
		Tagged =

		TaggedSentence = ""
		for i in Tagged:
			TaggedSentence = TaggedSentence+"/".join(i)+" "

		#print TaggedSentence
		return TaggedSentence

	# Returns the tagger output for the sentence.
	def TaggedSentence(self, sentence ):
		# NOTE(review): the tagger call that produces "Tagged" is missing.
		Tagged =
		return Tagged

	# Stems the word with the Lancaster stemmer and returns the stem when the
	# en_US enchant dictionary accepts it.
	def FindStemmedVerb(self, word):
		st = LancasterStemmer()
		StemmedVerb = st.stem(word)
		dic = enchant.Dict("en_US")
		if( dic.check(StemmedVerb) ):
			return StemmedVerb
			# NOTE(review): unreachable as written; an "else:" presumably preceded
			# this line so that a trailing "e" is added to rejected stems.
			return StemmedVerb+"e"			

	# Splits a sentence into chunks around the tokens tagged as verbs ("VB*") and
	# returns the distinct chunks with their last character and punctuation removed.
	def FindSplit(self, sentence, TaggedSentence):
		TokenizedSentence = nltk.word_tokenize(sentence)

		SplitList = []
		SentAdded = ""
		split = 0 

		#print TaggedSentence

		# NOTE(review): the indentation below shows that lines are missing (most
		# likely a "try:", two "else:" lines, an "except" branch and the statements
		# that append to SplitList). In the visible code SplitList is never filled.
		for i in range(len(TaggedSentence)):
			if TaggedSentence[i][1].startswith("VB"):
					if (TaggedSentence[i+1][1].startswith("VB")):
						SentAdded = ""
						SentAdded = TaggedSentence[i][0]+" "
					#	print "split"
				#print SentAdded
				SentAdded = SentAdded + TokenizedSentence[i] + " "

		# Drop empty chunks and duplicates (the order is not preserved by set()).
		Str_list = filter(None, SplitList)
		Str_list = list(set(Str_list))

		# [:-1] removes the last character of each chunk before punctuation is deleted.
		for i in range(len(Str_list)):
			Str_list[i] = Str_list[i][:-1].translate(string.maketrans("",""), string.punctuation)
		return Str_list
'''
Created on Mar 11, 2016

@author: zhongzhu
'''
import os

from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger

# Smoke test of the four Stanford wrappers shipped with NLTK. The jars and
# models are expected to be reachable through the environment (CLASSPATH /
# STANFORD_MODELS). Only the dependency parse is printed; the other results
# are discarded.

# POS tagging
st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
st.tag('What is the airspeed of an unladen swallow ?'.split())

# Named entity recognition
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

# Constituency parsing
parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))

# Dependency parsing
dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
# Single-argument print() behaves the same on Python 2 and Python 3.
print([parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")])
Esempio n. 50
class Parser:
    """Concept helper built on the Stanford POS tagger: noun overlap test and bigram extraction."""

    def __init__(self):
        self.MatchList = []
        self.ConceptMatches = []
        # NOTE(review): the attribute name "st" is reconstructed; the assignment
        # target was lost in this snippet. Confirm against the original source.
        self.st = StanfordPOSTagger('stanford-postagger/models/english-bidirectional-distsim.tagger', 'stanford-postagger/stanford-postagger.jar')

    def SyntacticMatch(self, concept1, concept2):
        """Check for syntactic similarity between two concepts.

        Returns True when both concepts contain the same word carrying the same
        noun tag ("NN*"), False otherwise.
        """
        # NOTE(review): whitespace tokenization is reconstructed; the tagger
        # calls were lost in this snippet.
        TaggedConcept1 = self.st.tag(concept1.split())
        TaggedConcept2 = self.st.tag(concept2.split())

        print(TaggedConcept1)
        print(TaggedConcept2)

        for tagged in TaggedConcept1:
            if tagged[1].startswith("NN") and tagged in TaggedConcept2:
                return True
        return False

    def FindBigrams(self, concept):
        """Find the bigrams (and single-word concepts) associated with a concept.

        E.g. "a very special christmas gift" is walked as the word pairs
        "a very", "very special", "special christmas", "christmas gift".
        """
        # NOTE(review): whitespace tokenization is reconstructed (see above).
        sentence = self.st.tag(concept.split())
        print(sentence)

        # Loaded once; the stop word list used to be re-read for every test.
        stop_words = set(stopwords.words('english'))

        Bigrams = []
        for i in range(len(sentence) - 1):
            word, tag = sentence[i][0], sentence[i][1]
            next_word, next_tag = sentence[i + 1][0], sentence[i + 1][1]
            pair = word + " " + next_word

            if tag == "JJ" and next_word in stop_words:
                # [ adj + stopword ] is ignored.
                continue
            elif word in stop_words and next_word in stop_words:
                # [ stopword + stopword ], e.g. "a very", is ignored.
                continue
            elif next_tag == "JJ" and word in stop_words:
                # [ stopword + adj ], e.g. "amazingly a", is ignored.
                continue
            elif tag == "JJ" and next_tag.startswith("NN"):
                # [ adj + concept ]: "special christmas" -> "christmas", "special christmas"
                Bigrams.append(next_word)
                Bigrams.append(pair)
            elif word in stop_words and next_tag.startswith("NN"):
                # [ stopword + concept ]: "the christmas" -> "christmas", "the christmas"
                Bigrams.append(next_word)
                Bigrams.append(pair)
            elif tag.startswith("NN") and next_tag == "JJ":
                # [ concept + adj ]: "present amazing" -> "present"
                Bigrams.append(word)
            elif tag.startswith("NN") and next_word in stop_words:
                # [ concept + stopword ]: "christmas the" -> "christmas"
                Bigrams.append(word)
            else:
                Bigrams.append(pair)
        print(Bigrams)

        return Bigrams
Esempio n. 51
class NltkHelper:
	"""Tokenizes and POS-tags a text with the Stanford tagger and offers noun, bigram and WordNet helpers."""

	def __init__(self, text):
		"""
		tokenizes and tags the text once and caches the results:
		text, stanford (the tagger), sentences, words, tags and taggedBigrams
		"""
		self.text = text

		root = os.path.dirname(os.path.realpath(__file__))
		os.environ["STANFORD_PARSER"] = root+"/stanford-postagger/stanford-postagger.jar"
		os.environ["STANFORD_MODELS"] = root+"/stanford-postagger/models/"
		_path_to_model  = root + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
		_path_to_jar    = root + '/stanford-postagger/stanford-postagger.jar'
		self.stanford   = StanfordPOSTagger(_path_to_model, _path_to_jar)

		self.sentences  = sent_tokenize(text.encode("utf-8"))
		self.words      = word_tokenize(text.encode("utf-8"))

		# tags are computed on the lower-cased text
		self.tags = self.stringifyTuples(self.stanford.tag( word_tokenize(text.lower()) ))
		self.taggedBigrams = self.ngramsAndTags(2)

	def personal_names(self):
		"""
		returns "word1 word2" for every bigram whose two tags both look like a personal name
		"""
		#(('reports', 'NNS'), ('claim', 'VBP'))
		return [
			"{0} {1}".format(gram[0][0], gram[1][0])
			for gram in self.taggedBigrams
			if self.isPersonalName( gram[0][1] ) and self.isPersonalName( gram[1][1] )
		]

	def isPersonalName(self, tag):
		return tag == "NNP" or tag == "FW"

	def preprocessTitle(self):
		"""
		returns the tagged words joined by spaces (with a leading space);
		personal names are title-cased, all other words are lower-cased
		"""
		output = ''
		for taggedWord in self.tags:
			word = taggedWord[0]
			tag  = taggedWord[1]

			if self.isPersonalName(tag):
				output = "{0} {1}".format(output, word.title())
			else:
				# without this "else" every name was appended twice
				output = "{0} {1}".format(output, word.lower())

		return output

	def ngramsAndTags(self, n):
		"""
		returns the list of n-grams over self.tags, each n-gram being a tuple of (word, tag) tuples
		"""
		return [ tuple(self.tags[i:i+n]) for i in range(len(self.tags)-n+1) ]

	def sortFrequencies( self, ngram ):
		"""
		returns the (key, count) pairs of the dictionary sorted by decreasing count
		"""
		return sorted(ngram.items(), key = operator.itemgetter(1), reverse=True)

	def findTags(self):
		"""
		returns "word1 word2" for every bigram made of (adjective or noun) + noun
		"""
		#pattern = [("AJ", NOUN/S/FWS), (FW, FW), NOUN, NOUN]
		output = []

		#(('reports', 'NNS'), ('claim', 'VBP'))
		for gram in self.taggedBigrams:
			tag1  = gram[0][1]
			tag2  = gram[1][1]

			if (self.isAdj( tag1 ) or self.isNounOrForeignWord( tag1 )) and self.isNounOrForeignWord( tag2 ):
				output.append( "{0} {1}".format(gram[0][0], gram[1][0]) )
		return output

	def isAdj(self, tag):
		return tag=='JJ'

	def isNounOrForeignWord(self, tag):
		nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
		return tag in nouns

	def bigramsList(self):
		# NOTE(review): the body of this method is missing from the snippet; it is
		# made explicit instead of guessing what it returned.
		raise NotImplementedError("bigramsList: body missing from the source snippet")

	def stringifyList(self, list):
		"""
		returns the items of the list as utf-8 encoded strings
		"""
		return [ str(tag.encode('utf-8')) for tag in list ]

	def stringifyTuples(self, tuples):
		"""
		returns the (word, tag) tuples with both members as utf-8 encoded strings
		"""
		return [ (str(tag[0].encode('utf-8')), str(tag[1].encode('utf-8'))) for tag in tuples ]

	def analyze(self):
		"""
		returns list of tuples of tagged words in text
		"""
		# the tags of all sentences are collected; previously only the last
		# sentence survived and an empty text raised a NameError
		taggedWords = []
		for sentence in self.sentences:
			taggedWords.extend( self.stanford.tag( word_tokenize( sentence.lower() ) ) )

		return self.stringifyTuples(taggedWords)

	def filterNounsInText(self):
		"""
		returns list of nouns and foreign words
		"""
		output = set()
		nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']

		for sentence in self.sentences:
			taggedWords = self.stanford.tag( word_tokenize(sentence.lower() ) )
			for item in taggedWords:
				if item[1] in nouns:
					output.add( item[0] )
		return self.stringifyList( list(output) )

	def cleanWords(self):
		"""
		returns the words of the text without line breaks, "[12]"-style markers,
		non-ascii characters and surrounding punctuation; one-letter leftovers
		other than "a" and "i" are dropped
		"""
		joined = "".join( " {0}".format(item) for item in self.words )

		joined = re.sub(r'\n+', " ", joined)
		joined = re.sub(r'\[[0-9]*\]', "", joined)
		joined = re.sub(r' +', " ", joined)
		# the decoded value is kept now; it used to be computed and thrown away
		joined = bytes(joined).decode('ascii', 'ignore')

		cleanInput = []
		for item in joined.split(" "):
			item = item.strip( string.punctuation )

			if len(item)>1 or item.lower() in ('a', 'i'):
				cleanInput.append( item )

		return cleanInput

	def bigramNouns(self, text):
		# NOTE(review): the nouns are computed but nothing is returned in the
		# visible code; the rest of this method may be missing from the snippet.
		nouns = self.filterNouns(text)

	def isTagNounOrForeignWord(self, word):
		"""
		returns True when the tagger labels the word as a noun or a foreign word
		"""
		nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
		# the tagger expects a token list; a bare string was tagged letter by letter
		taggedWords = self.stanford.tag( [ word.lower() ] )
		return any( item[1] in nouns for item in taggedWords )

	def filterNouns(self, input):
		"""
		returns list of nouns and foreign words found in the given text
		"""
		output = set()
		nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
		for sentence in sent_tokenize(input):
			taggedWords = self.stanford.tag( word_tokenize(sentence.lower() ) )
			for item in taggedWords:
				if item[1] in nouns:
					output.add( item[0] )
		# the result is a list of words, so stringifyList applies; stringifyTuples
		# returned the first two characters of each word
		return self.stringifyList( list(output) )

	def define( self, word ):
		"""
		returns the WordNet definitions of the word (empty list if it cannot be defined)
		"""
		definitions = []
		try:
			for synset in wn.synsets(word):
				definitions.append( synset.definition() )
		except ValueError:
			print("Cannot define '{0}'".format(word))

		return definitions

	def sentenceExamples( self, noun):
		"""
		returns the WordNet example sentences of the noun (empty list if none can be found)
		"""
		output = []
		try:
			for synset in wn.synsets(noun):
				output.extend( synset.examples() )
		except (ValueError, AttributeError):
			# "except ValueError, AttributeError" only caught ValueError
			print("Cannot find any example for '{0}'".format(noun))

		return output
Esempio n. 52
# Demo script: configures STANFORD_MODELS, builds a StanfordParser, parses some
# sentences and finally POS-tags "my_sentence" with the Stanford tagger.
# NOTE(review): the right-hand sides after the line continuations and the bodies
# of the loops were lost when this snippet was extracted, so the script is not
# runnable as it stands; "my_sentence" is not defined in the visible code.
os.environ['STANFORD_MODELS'] = \

parser = stanford.StanfordParser(model_path= \

parsed_sentences = parser.raw_parse( \

for i in parsed_sentences:
    for k in i:

for line in parsed_sentences:
    for sentence in line:


# Tagger built from an explicit model path and jar path.
st = StanfordPOSTagger(r'C:/stanford_data/english-bidirectional-distsim.tagger',r'C:/stanford_data/stanford-postagger.jar')

bobo = st.tag(my_sentence.split())


for i in bobo:
Esempio n. 53
from nltk.tag.stanford import StanfordPOSTagger
import nltk
import os

# Point CLASSPATH at the unpacked Stanford POS tagger distribution.
os.environ['CLASSPATH'] = "/home/vishesh/Downloads/stanford-postagger-full-2015-12-09/"

# English tagger, given only a relative model path.
english_postagger = StanfordPOSTagger('models/english-bidirectional-distsim.tagger')

# Smoke test: tag one tokenized English sentence and print the result.
print english_postagger.tag(nltk.word_tokenize('this is stanford postagger in nltk for python users'))

# NOTE(review): the input path is an empty string and the right-hand side of
# `data =` is missing in this copy (presumably `fo.read()` -- confirm).
fo = open('','r')
data =

# Output file; nothing is written to it (and neither file is closed) in the
# part of the script visible here.
fw = open('europarl_tags_testing.txt','w')

# Decode the raw input as UTF-8 and split it into lines.
data = data.decode('utf-8')
data = data.split('\n')

#tokens = data.split()
#print len(tokens)

#print 'Tagging...'

# German tagger loaded from an absolute model path.
german_postagger = StanfordPOSTagger('/home/vishesh/Documents/NLP/postagger/models/german-fast-caseless.tagger')
# Tokenize and tag lines 10000..11499 of the input; `tags` is not used in the
# part of the script visible here.
for i in range(10000,11500):
	tokens = nltk.word_tokenize(data[i])
	tags = german_postagger.tag(tokens)
Esempio n. 54
from gensim import matutils
from math import log
from collections import Counter, defaultdict

from sklearn.decomposition import NMF

from nltk.tokenize import TweetTokenizer
import nltk
import re
import os
# Root directory that holds the unpacked Stanford tool distributions.
path = '/home/kaminem64/stanford'
# Jar and model locations are published through environment variables before
# the tagger is created.
os.environ['CLASSPATH'] = '%s/stanford-postagger-full-2015-04-20/stanford-postagger.jar:%s/stanford-ner-2015-04-20/stanford-ner.jar:%s/stanford-parser-full-2015-04-20/stanford-parser.jar:%s/stanford-parser-full-2015-04-20/stanford-parser-3.6.0-models.jar' % ((path,) * 4)
os.environ['STANFORD_MODELS'] = '%s/stanford-postagger-full-2015-04-20/models:%s/stanford-ner-2015-04-20/classifiers' % ((path,) * 2)
from nltk.tag.stanford import StanfordPOSTagger

stanford_pos_tag = StanfordPOSTagger('english-bidirectional-distsim.tagger')

import xlsxwriter
workbook = xlsxwriter.Workbook('topic_modeling.xlsx')
worksheet = workbook.add_worksheet()
row_num = 0
# Header row: one title per column, left to right.
for _col, _title in enumerate(('store_app_id', 'name', 'start_date',
                               'end_date', 'release_note', 'topics')):
    worksheet.write(row_num, _col, _title)

app_ids = [307906541]#, 282614216, 383298204, 421254504, 509993510, ]
previous_date = None
Esempio n. 55
# Query parser built around a Stanford POS tagger, a Snowball stemmer, a
# stop-word collection, per-type word sets and a word -> score sentiment table
# (all created in __init__).
# NOTE(review): this copy of the class is damaged -- several docstrings are
# never closed and a number of statements (loop/if bodies, `else:` lines,
# decorators) are missing.  The notes below mark each spot; the code itself
# is left untouched and must be restored from the original project.
class Parser(object):
    # Directory holding the tagger files, the stop-word list and AFINN.txt;
    # BASE_DIR is defined elsewhere.
    modeldir = os.path.abspath(BASE_DIR + "/weiss/planner/models/")
    stopword_path = modeldir + "/english.stp"

    # Builds the tagger, stemmer, stop-word collection, type-word sets and
    # sentiment table.  `stopword` is a helper defined elsewhere.
    def __init__(self):
        self._postagger = StanfordPOSTagger(self.modeldir + '/postagger/models/english-bidirectional-distsim.tagger',
                                           self.modeldir + '/postagger/stanford-postagger.jar')
        self._stemmer = nltk.SnowballStemmer("english")
        self._stopwords = stopword(self.stopword_path)
        self._type_words = self._set_type_words()
        self._sentiment = self._get_sentiment()

    # Reads AFINN.txt from modeldir and returns a dict mapping each word to
    # its integer score.  Every line is split on a tab into (word, score).
    # The file object is never closed explicitly.
    def _get_sentiment(self):
        sentiment = {}
        for line in open(self.modeldir + "/AFINN.txt"):
            word, score = line.split('\t')
            sentiment[word] = int(score)
        return sentiment

    # Returns the sum of the sentiment scores of the tokens of `query` that
    # appear in the sentiment table; tokens are looked up exactly as
    # tokenized (no lower-casing) and unknown tokens contribute nothing.
    def calculate_sentiment(self, query):
        tokens = nltk.word_tokenize(query)
        score = 0
        for token in tokens:
            if token in self._sentiment:
                score += self._sentiment[token]
        return score

    # NOTE(review): damaged in this copy -- the docstring is never closed, the
    # five `if` statements inside the loop have lost their bodies, and the
    # call that wrapped the "Here are the keywords" message is cut off.
    # What remains: the query is tokenized and tagged, each tag is checked
    # against the stop words, the three type-word sets and the NN/JJ tag
    # prefixes, and a non-empty `tuples` list is stored in
    # arguments['keywords'].
    def entity_recognition(self, query, arguments):
        """Parse query and extract keywords

        This function is called in planner

            query: query needs to be parsed
            arguments: info needs to be updated
        tokens = nltk.word_tokenize(query)
        tags = self._postagger.tag(tokens)

        tuples = []

        for tag in tags:
            if tag[0] in self._stopwords:
            stemmed = self._stemmer.stem(tag[0])
            if stemmed in self._type_words['movie']:
            if stemmed in self._type_words['article']:
            if stemmed in self._type_words['restaurant']:
            if tag[1][:2] == 'NN' or tag[1][:2] == 'JJ':

        if len(tuples) > 0:
            arguments['keywords'] = tuples
  "Here are the keywords: %s" % arguments['keywords'])

    # NOTE(review): the docstring is never closed in this copy.  As written,
    # the three loops only create an empty set per key; no word from the
    # lists is ever added (the statement that filled the sets appears to be
    # missing -- restore it from the original).
    def _set_type_words(self):
        """Initialize synonymy words of movie, article and restaurant

        This function is called during initialization

        Return: A dictionary, key: movie, article, restaurant, value: their synonymy words
        topic = {}
        movie = ['cinema', 'show', 'film', 'picture', 'cinematograph',
                 'videotape', 'flick', 'pic', 'cine', 'cinematics', 'photodrama',
                 'photoplay', 'talkie', 'flicker', 'DVD', 'movie']
        article = ['report', 'announcement', 'story', 'account',
                   'newscast', 'headlines', 'press', 'communication', 'talk', 'word',
                   'communique', 'bulletin', 'message', 'dispatch', 'broadcast',
                   'statement', 'intelligence', 'disclosure', 'revelation',
                   'gossip', 'dispatch', 'news', 'article']
        restaurant = ['bar', 'cafeteria', 'diner', 'dining', 'saloon', 'coffeehouse',
                      'canteen', 'chophouse', 'drive-in', 'eatery', 'grill', 'lunchroom', 'inn', 'food',
                      'pizzeria', 'hideaway', 'cafe', 'charcuterie', 'deli', 'restaurant']
        for m in movie:
            topic.setdefault('movie', set([]))
        for a in article:
            topic.setdefault('article', set([]))
        for r in restaurant:
            topic.setdefault('restaurant', set([]))
        return topic

    # NOTE(review): the docstring is never closed, and the final assignment of
    # Type.Unknown sits at the same level as the if/elif chain, so it always
    # overwrites 'tid' (an `else:` line appears to be missing).
    # The stems of the first, last and second-to-last tokens are compared
    # with the article, restaurant and movie word sets, in that order.
    # tokens[0] raises IndexError when the query yields no tokens.
    def type_recognition(self, query, arguments):
        """Identity the type of the topic: movie, article or restaurant

        This is called in planner

            query: query needs to be parsed
            arguments: info needs to be updated

        tokens = nltk.word_tokenize(query)
        first = self._stemmer.stem(tokens[0])
        last = self._stemmer.stem(tokens[-1])
        lastsecond = self._stemmer.stem(tokens[-2]) if len(tokens) > 1 else "toy"
        if (first in self._type_words['article'] or last in self._type_words['article']
            or lastsecond in self._type_words['article']):
            arguments['tid'] = Type.News
        elif (first in self._type_words['restaurant'] or last in self._type_words['restaurant']
              or lastsecond in self._type_words['restaurant']):
            arguments['tid'] = Type.Restaurant
        elif (first in self._type_words['movie'] or last in self._type_words['movie']
              or lastsecond in self._type_words['movie']):
            arguments['tid'] = Type.Movie
            arguments['tid'] = Type.Unknown

    # Maps an ordinal or cardinal word ('first'/'one' .. 'fifth'/'five') to a
    # zero-based index; any other value falls through and returns None.
    # NOTE(review): takes no `self` but is called as self._string_to_idx(...)
    # in find_number -- a @staticmethod decorator appears to be missing.
    def _string_to_idx(number):
        if number == 'first' or number == 'one':
            return 0
        if number == 'second' or number == 'two':
            return 1
        if number == 'third' or number == 'three':
            return 2
        if number == 'fourth' or number == 'four':
            return 3
        if number == 'fifth' or number == 'five':
            return 4

    # NOTE(review): damaged in this copy -- the bodies of `for w in words` and
    # `for word in entity_name` are missing, nothing is ever appended to
    # `overlap`, and the method takes no `self` (decorator presumably
    # missing).  What remains compares the `phonics` set with a per-entity
    # set and stores the index of a full match in arguments['idx'].
    # Under Python 2 the `/` on two ints is integer division.
    def keyword_matching(arguments, entities):
        words = arguments['keywords']
        phonics = set([])
        overlap = []

        for w in words:

        for i in xrange(0, len(entities)):
            entity_name = nltk.word_tokenize(entities[i].name)
            entity_phonics = set([])
            for word in entity_name:
            common = len(phonics & entity_phonics) / len(entity_phonics)
            if common == 1:
                arguments['idx'] = i
        arguments['idx'] = overlap.index(max(overlap))

    # Looks for an ordinal or number in `query` and stores a zero-based index
    # in arguments['idx'].
    # NOTE(review): the two final assignments are at the same level, so the
    # "count from the end" value always overwrites the first one (an `else:`
    # line appears to be missing).  _string_to_idx may return None, which
    # makes the subtraction raise TypeError.
    def find_number(self, query, arguments, entities):
        tokens = nltk.word_tokenize(query)
        tags = self._postagger.tag(tokens)
        # Position of the substring 'last' in the query, or -1 when absent.
        last = query.find('last')

        # Edge case, "first" cannot be tagged correctly
        if len(query.split(" ")) <= 3 and query.find('first') != -1:
            arguments['idx'] = 0

        number = None
        for t in tags:
            # Adjective ending in an ordinal suffix, e.g. "second", "4th".
            if t[1] == 'JJ' and t[0][-2:] in set(['th', 'nd', 'st', 'rd']):
                number = t[0]
            elif t[1] == 'CD' and t[0]:
                number = t[0]
                # Digits below 6 are used directly as a 1-based position.
                if number.isdigit() and int(number) < 6:
                	arguments['idx'] = int(number) - 1

        if number is not None:
            if last == -1:
                arguments['idx'] = self._string_to_idx(number)
                arguments['idx'] = len(entities) - self._string_to_idx(number) - 1
class NLTKHelper(object):
    """Tokenize and POS-tag a text with the Stanford tagger and query the result.

    Attributes set by ``__init__``:
        text: the text exactly as given.
        sentences: ``sent_tokenize`` of the UTF-8 encoded text.
        words: ``word_tokenize`` of the UTF-8 encoded text.
        tags: ``(word, tag)`` pairs for the lower-cased text.
        taggedBigrams: consecutive pairs of entries from ``tags``.
    """

    # Tags counted as "noun or foreign word" by every noun filter in the class.
    NOUN_TAGS = ('NN', 'NNS', 'NNP', 'NNPS', 'FW')

    def __init__(self, text):
        """Tag ``text`` and pre-compute its sentences, words, tags and bigrams."""
        self.text = text

        root = os.path.dirname(os.path.realpath(__file__))
        # NOTE(review): the path components that were appended to ``root``
        # are missing from this copy (two assignments ended in a dangling
        # ``+`` and the model/jar suffixes are empty strings).  Everything
        # therefore points at ``root`` itself until the real locations are
        # filled in.
        os.environ["STANFORD_PARSER"] = root + ''
        os.environ["STANFORD_MODELS"] = root + ''
        _path_to_model = root + ''
        _path_to_jar = root + ''
        self.stanford = StanfordPOSTagger(_path_to_model, _path_to_jar)
        self.sentences = sent_tokenize(text.encode("utf-8"))
        self.words = word_tokenize(text.encode("utf-8"))

        self.tags = self.stringifyTuples(self.stanford.tag(word_tokenize(text.lower())))

        self.taggedBigrams = self.ngramsAndTags(2)

    def personal_names(self):
        """Return "word1 word2" for every bigram whose two tags are both NNP or FW."""
        return [
            "{0} {1}".format(gram[0][0], gram[1][0])
            for gram in self.taggedBigrams
            if self.isPersonalName(gram[0][1]) and self.isPersonalName(gram[1][1])
        ]

    def isPersonalName(self, tag):
        """Return True for the tags NNP and FW."""
        return tag == "NNP" or tag == "FW"

    def preprocessTitle(self):
        """Return the tagged words joined by spaces, title-casing NNP/FW words.

        All other words are lower-cased.  Every word is preceded by a single
        space, so a non-empty result starts with a space.
        """
        pieces = []
        for taggedWord in self.tags:
            word, tag = taggedWord[0], taggedWord[1]
            # The ``else`` branch was missing, so every word was appended
            # twice (title-cased, then lower-cased).
            pieces.append(word.title() if self.isPersonalName(tag) else word.lower())
        # Returned after the loop: the original returned from inside it and
        # therefore stopped at the first word.
        return ''.join(' ' + piece for piece in pieces)

    def ngramsAndTags(self, n):
        """Return every run of ``n`` consecutive entries of ``self.tags`` as a tuple.

        Returns an empty list when ``n`` is smaller than 1 or larger than the
        number of tags.
        """
        if n < 1:
            return []
        # The original built each gram but never stored it, so the result
        # was always empty.
        return [tuple(self.tags[i:i + n]) for i in range(len(self.tags) - n + 1)]

    def sortFrequencies(self, ngram):
        """Return the items of the mapping ``ngram`` sorted by value, largest first."""
        return sorted(ngram.items(), key=operator.itemgetter(1), reverse=True)

    def findTags(self):
        """Return "word1 word2" for every adjective+noun or noun+noun bigram."""
        output = []
        for gram in self.taggedBigrams:
            word1, tag1 = gram[0][0], gram[0][1]
            word2, tag2 = gram[1][0], gram[1][1]
            # (adj, noun) or (noun, noun): the second word must be a noun.
            if self.isNounOrForeignWord(tag2) and (
                    self.isAdj(tag1) or self.isNounOrForeignWord(tag1)):
                output.append("{0} {1}".format(word1, word2))
        # Returned after the loop: the original returned from inside it and
        # therefore inspected only the first bigram.
        return output

    def isAdj(self, tag):
        """Return True for the tag JJ."""
        return tag == 'JJ'

    def isNounOrForeignWord(self, tag):
        """Return True if ``tag`` is one of ``NOUN_TAGS``."""
        return tag in self.NOUN_TAGS

    def stringifyList(self, list):
        """Return ``str(item.encode('utf-8'))`` for every item of ``list``."""
        return [str(tag.encode('utf-8')) for tag in list]

    def stringifyTuples(self, tuples):
        """Return the pairs in ``tuples`` with both members UTF-8 encoded and passed through ``str``."""
        return [
            (str(tag[0].encode('utf-8')), str(tag[1].encode('utf-8')))
            for tag in tuples
        ]

    def analyze(self):
        """Return the ``(word, tag)`` pairs of every sentence of the text, in order.

        Each sentence is lower-cased, tokenized and tagged on its own; the
        pairs of all sentences are concatenated into one list.
        """
        tagged = []
        for sentence in self.sentences:
            # Accumulate instead of returning inside the loop, which covered
            # only the first sentence.
            tagged.extend(self.stanford.tag(word_tokenize(sentence.lower())))
        return self.stringifyTuples(tagged)

    def _nounsIn(self, sentences):
        """Return the set of words tagged with one of ``NOUN_TAGS`` in ``sentences``."""
        found = set()
        for sentence in sentences:
            for item in self.stanford.tag(word_tokenize(sentence.lower())):
                if item[1] in self.NOUN_TAGS:
                    found.add(item[0])
        return found

    def filterNounsInText(self):
        """Return the distinct nouns / foreign words of the stored sentences."""
        return self.stringifyList(list(self._nounsIn(self.sentences)))

    def cleanWords(self):
        """Return the stored words with bracketed numbers and punctuation removed.

        The words are joined with spaces, newlines and ``[digits]`` markers
        are removed, runs of spaces are collapsed and non-ASCII bytes are
        dropped.  Each remaining item is stripped of surrounding punctuation
        and kept if it is longer than one character or is "a" / "i" in
        either case.
        """
        text = ''.join(" {0}".format(item) for item in self.words)

        text = re.sub(r'\n+', " ", text)
        text = re.sub(r'\[[0-9]*\]', "", text)
        text = re.sub(r' +', " ", text)
        # The original decoded to ASCII but threw the result away; keep it so
        # that non-ASCII bytes are actually dropped.
        text = bytes(text).decode('ascii', 'ignore').encode('ascii')

        cleanInput = []
        for item in text.split(" "):
            item = item.strip(string.punctuation)
            if len(item) > 1 or item.lower() in ('a', 'i'):
                cleanInput.append(item)

        return cleanInput

    def bigramNouns(self, text):
        """Run ``filterNouns`` over ``text``; nothing is returned."""
        # NOTE(review): the nouns are computed and discarded; no bigrams are
        # built.  The method looks unfinished.
        self.filterNouns(text)

    def isTagNounOrForeignWord(self, word):
        """Return True if any token of ``word`` is tagged with one of ``NOUN_TAGS``.

        ``word`` is lower-cased and split on whitespace so that the tagger
        receives a token list; a bare string would be tagged character by
        character.  Empty input returns False without calling the tagger.
        """
        tokens = word.lower().split()
        if not tokens:
            return False
        return any(item[1] in self.NOUN_TAGS for item in self.stanford.tag(tokens))

    def filterNouns(self, input):
        """Return the distinct nouns / foreign words found in the text ``input``.

        Uses the same tag list as the other noun filters (the original list
        here lacked NNP) and encodes the words with ``stringifyList``; they
        are plain strings, not ``(word, tag)`` pairs.
        """
        return self.stringifyList(list(self._nounsIn(sent_tokenize(input))))

    def define(self, word):
        """Return the definition of every synset that ``wn.synsets`` finds for ``word``.

        If the lookup raises ``ValueError`` a message is printed and whatever
        was collected so far (possibly nothing) is returned.
        """
        definitions = []
        try:
            for synset in wn.synsets(word):
                definitions.append(synset.definition())
        except ValueError:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Cannot define '{0}'".format(word))

        return definitions
Esempio n. 57
# -*- coding: utf-8 -*-
import nltk
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import word_tokenize

# Directory where the full Stanford POS tagger distribution was downloaded
# and unzipped.
sp_dir = '/home/sarah/postagger/'
english_model = sp_dir + 'models/english-bidirectional-distsim.tagger'
chinese_model = sp_dir + 'models/chinese-distsim.tagger'
jar_path = sp_dir + 'stanford-postagger.jar'

# --- English model: tokenize one sentence, tag it, print each pair ---
print("For the English model")
st_eng = StanfordPOSTagger(model_filename=english_model, path_to_jar=jar_path)
eng_sent = 'This is Stanford postagger in nltk for Python users.'
print(eng_sent)
eng_tokens = word_tokenize(eng_sent)
eng_tagged = st_eng.tag(eng_tokens)
for i in eng_tagged:
    print(i)

# --- Chinese model: same steps, with the tagger encoding set to UTF-8 ---
print("\n\nFor the Chinese model")
st_chi = StanfordPOSTagger(model_filename=chinese_model, path_to_jar=jar_path,
                           encoding='utf-8')
chi_sent = '这 是 在 Python 环境 中 使用 斯坦福 词性 标 器'
print(chi_sent)
chi_tokens = word_tokenize(chi_sent)
chi_tagged = st_chi.tag(chi_tokens)
for i in chi_tagged:
    print(i)
#print st_chi.tag('这 是 在 Python 环境 中 使用 斯坦福 词性 标 器'.split())
from nltk import pos_tag,word_tokenize
#from Utils import getQues
#txt="benim adim yahya"
from nltk.tag.stanford import StanfordPOSTagger
# Sample sentence to tag.
txt="i am dentist"
# Tokenize the sentence, tag it with `tgr` and print the result.
# NOTE(review): `tgr` is not defined anywhere in this snippet -- presumably a
# StanfordPOSTagger instance whose construction is missing; confirm against
# the original script.
print  tgr.tag(word_tokenize(txt))