Code example #1
def tagged_def():
    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    tagger = StanfordPOSTagger(
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/english-bidirectional-distsim.tagger',
        # note: the second argument is expected to be the path to stanford-postagger.jar, not the model file
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/english-bidirectional-distsim.tagger'
    )

    path_data = "data" + os.sep + "items_tagged_modified.json"
    data = json.load(codecs.open(path_data, encoding='UTF-8'))
    for item in data:
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            #             print chardet.detect(definition)
            print definition.encode('gbk')
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            tokens = nltk.word_tokenize(definition_pure)
            #             print tokens
            for token in tokens:
                print chardet.detect(token)
            tagged_tokens = tagger.tag(definition_pure.encode('utf-8').split())
            pos2def['tagged_def'] = tagged_tokens

    path_tagged_output = "items_tagged_auto.json"
    json.dump(data,
              codecs.open(path_tagged_output, 'w', 'utf-8'),
              ensure_ascii=False,
              indent=2)
Code example #2
def getUsername(message, *args):
    pos_tagger = StanfordPOSTagger(model, jar, encoding="utf-8")
    words = nltk.word_tokenize(message.lower())
    tagged_words = pos_tagger.tag(words)
    sug_usernames = []
    # Check if a previous username input was passed
    if len(args) > 0:
        previous_username = args[0]
        sug_usernames = [
            word for word, tag in tagged_words
            if tag in ['NN', 'NNP', 'FW', 'NNPS'] and word != previous_username
        ]
    else:
        sug_usernames = [
            word for word, tag in tagged_words
            if tag in ['NN', 'NNP', 'FW', 'NNPS']
        ]

    if len(sug_usernames) > 0:
        if getSentenceSentiment(message) == 'pos':
            return sug_usernames[-1]
        else:
            return sug_usernames[-1] + 'salt123'  # return last suggested username

    return 'randomuser567user'
Code example #3
def extractor():
    st = StanfordPOSTagger(
        '../stanford-postagger-full-2015-12-09/models/english-bidirectional-distsim.tagger',
        '../stanford-postagger-full-2015-12-09/stanford-postagger-3.6.0.jar')
    nouns = []
    pnouns = []
    i = 0

    with open('../data/scraped_text_NYT.txt', 'r',
              encoding='utf-8') as inputFile:
        comment = inputFile.readline()
        while comment != "":
            sentences = sent_tokenize(comment, 'english')

            for sent in sentences:
                if (sent.strip() == ""):
                    continue
                pos_tags = st.tag(sent.split())
                for pos_tag in pos_tags:
                    if (pos_tag[1] == 'NN' or pos_tag[1] == 'NNS'):
                        nouns = nouns + [pos_tag[0]]
                    elif (pos_tag[1] == 'NNP' or pos_tag[1] == 'NNPS'):
                        pnouns = pnouns + [pos_tag[0]]
            i = i + 1
            print(i)
            print(comment)
            comment = inputFile.readline()

    # write the collected nouns and proper nouns to the output file
    with open('../data/nouns_scraped_text_NYT.txt', 'a') as outFile:
        outFile.write('NOUNS:\n')
        for noun in nouns:
            outFile.write(noun + "\n")
        outFile.write('\n\nPNOUNS:\n')
        for pnoun in pnouns:
            outFile.write(pnoun + '\n')
Code example #4
    def __init__(self):

        # stanford ner tagger
        from nltk.tag.stanford import StanfordNERTagger
        self.ner_stanford = StanfordNERTagger(
            '/home/harish/Documents/softwares/running/corenlp/stanford-corenlp-caseless-2015-04-20-models/edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz',
            '/home/harish/Documents/softwares/running/corenlp/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2.jar'
        )

        # stanford pos tagger
        from nltk.tag.stanford import StanfordPOSTagger
        self.pos_stanford = StanfordPOSTagger(
            '/home/harish/Documents/softwares/running/corenlp/stanford-corenlp-caseless-2015-04-20-models/edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger',
            '/home/harish/Documents/softwares/running/corenlp/stanford-postagger-full-2015-04-20/stanford-postagger.jar'
        )

        # spacy ner tagger
        import spacy
        self.ner_spacy = spacy.load('en')

        # wordnet lemmatizer
        from nltk.stem.wordnet import WordNetLemmatizer
        self.lemmatizer = WordNetLemmatizer()

        self.tagged_output = {}
Code example #5
def test_StanfordAndNLTKPOS():
    import nltk
    from nltk.tag.stanford import StanfordPOSTagger
    #     sent = 'a low-calorie sweetener that reduces caries activity and the growth and transmission of S. mutans.'
    #     sent = 'a wire formed by drawing a cast structure through a die; used in dentistry for partial denture clasps and orthodontic appliances.'
    sent = 'readily stained with acid dyes.'
    print chardet.detect(sent)
    #     sent='technique metered spray refers to a topical anesthetic dispersal technique that controls the amount and rate at which a drug is administered.'
    #     sent='older term for a traumatic ulcer of the oral mucosa.'
    #     sent='one or more vertically parallel surfaces of abutment teeth shaped to direct the path of placement and removal of a remarkable partial denture. Also called guiding plane.'
    #     sent='agents that bond, seal, or cement particles or objects together.'
    #     sent='teeth that are at such an angle as to cause them to be out of centric contact with opposing teeth during occlusion.'
    start = datetime.now()
    text = nltk.word_tokenize(sent)
    nltk_pos = nltk.pos_tag(text)

    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    stanford_tagger = StanfordPOSTagger(
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/english-bidirectional-distsim.tagger',
        # note: the second argument is expected to be the path to stanford-postagger.jar, not the model file
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/english-bidirectional-distsim.tagger'
    )
    stanford_pos = stanford_tagger.tag(text)
    print 'nltk_pos: ' + str(nltk_pos)
    print 'stanford_pos: ' + str(stanford_pos)
Code example #6
File: FeatureUnion.py  Project: squidnee/lingo-bean
 def tagWordsInSentences(self, studying, entry):
     '''Tags the part of speech for each word.'''
     jar_path = 'stanford-postagger-full/stanford-postagger.jar'
     if studying in self.english:
         words = parseWordsFromEntry(entry)
         tagged_words = tagWords(words)
         return tagged_words
     elif studying in self.japanese or studying in self.korean or studying in self.mandarin:
         #segmenter = TinySegmenter()
         #words = segmenter.tokenize(entry)
         rm = RakutenMA()
         tagged_words = rm.tokenize(entry)
         #mecab = Mecab()
         #tagged_words = mecab.pos(entry)
         return tagged_words
     else:
         if studying in self.spanish:
             model_path = 'stanford-postagger-full/models/spanish.tagger'
             words = parseWordsFromEntry(entry)
         elif studying in self.french:
             model_path = 'stanford-postagger-full/models/french.tagger'
             words = parseWordsFromEntry(entry)
         postagger = StanfordPOSTagger(model_path,
                                       jar_path,
                                       encoding='utf8')
         tagged_words = postagger.tag(words)
         return tagged_words
Code example #7
def clean_words(tokens, filterStopwords=False, filterPos=None):
	cleanTokens = []
	stopwordList = stopwords.words('spanish')
	
	if filterPos:
		tagger = StanfordPOSTagger('stanford/models/spanish.tagger', 'stanford/stanford-postagger.jar', encoding='utf8')

	for token in tokens:
		cleanToken = token
		for char in string.punctuation:
			cleanToken = cleanToken.replace(char, "")
		
		if filterPos and not filterStopwords:
			res = tagger.tag([cleanToken])
			if len(res)>0:
				word, pos = res[0]
				if pos[0] in filterPos:
					cleanTokens.append(cleanToken)
		
		elif filterStopwords and not filterPos:
			if cleanToken not in stopwordList:
				cleanTokens.append(cleanToken)
		
		elif filterStopwords and filterPos:
			res = tagger.tag([cleanToken])
			if len(res)>0:
				word, pos = res[0]
				if cleanToken not in stopwordList and pos[0] in filterPos:
					cleanTokens.append(cleanToken)

		elif not filterStopwords and not filterPos:
			cleanTokens.append(cleanToken)
	
	return cleanTokens
Code example #8
def posTagging():
    myNounPhrases = []  # needed below when collecting NNP/NNPS tuples
    myCompletePOSStructure = []
    a = ['NNP', 'NNPS'] #Avoid NN,NNS. Only NNP , NNPS for purpose of NER.
    print '######## POS'
    english_postagger = StanfordPOSTagger(
        './Masters-Passau/stanford-postagger-full-2016-10-31/models/english-bidirectional-distsim.tagger',
        './Masters-Passau/stanford-postagger-full-2016-10-31/stanford-postagger.jar')
    #abc = english_postagger.tag('Steve Jobs was Founder of Apple. He was born in United States of America'.split())
    abc = english_postagger.tag('Who was the CEO of IBM'.split())
    print abc
    for number in abc:
        #print number[0],number[1]
        someTup = (number[0].encode('utf8'),number[1].encode('utf8'))
        #print someTup
        myCompletePOSStructure.append(someTup)

        #print split1[0] + ' ' + split1[1]
        #print unicodedata.normalize('NFKD', split1[0]).encode('ascii','ignore')
        #print unicodedata.normalize('NFKD', split1[1]).encode('ascii', 'ignore')

    print myCompletePOSStructure

    for number in abc:
        if any(x in number for x in a):
            #print number
            split1 = str(number).split(',')
            split2 = str(split1[0]).split('u')
            # print split2[1].replace("'", "")
            myNounPhrases.append(number)
Code example #9
    def __init__(self, translation_id):
        # Specify paths to Stanford taggers
        STANFORD_POS_TAGGER_LOCATION = os.environ['STANFORD_POS']
        english_modelfile = '{}/models/english-bidirectional-distsim.tagger'.format(STANFORD_POS_TAGGER_LOCATION)
        spanish_modelfile = '{}/models/spanish-distsim.tagger'.format(STANFORD_POS_TAGGER_LOCATION)
        jarfile = '{}/stanford-postagger-3.7.0.jar'.format(STANFORD_POS_TAGGER_LOCATION)

        # Set Translation ID
        self.translation_id = translation_id

        # Initialize taggers
        self.en_tagger = StanfordPOSTagger(model_filename=english_modelfile, path_to_jar=jarfile)
        self.es_tagger = StanfordPOSTagger(model_filename=spanish_modelfile, path_to_jar=jarfile)

        # Store the string literals from the VA3 files
        self.va3l1 = []
        self.va3l2 = []

        # Store tokenized plaintext sentences
        self.l1_tok_sent = []
        self.l2_tok_sent = []

        # Store the alignments as lists of lists of ints
        self.l1_alignments = []
        self.l2_alignments = []

        # Store the POS tags as lists of strings
        self.l1_pos_tags = []
        self.l2_pos_tags = []
Code example #10
File: crf.py  Project: lucaslioli/de-identification
def pos_tagging(docs, stanford_path, pos_tagger):
    print("\nGenerating Part-of-Speech tags...")

    # Configuring Stanford NLP POS tagger
    path_to_model = "{}/models/{}.tagger".format(stanford_path, pos_tagger)
    path_to_jar = "{}/stanford-postagger.jar".format(stanford_path)

    tagger = StanfordPOSTagger(model_filename=path_to_model,
                               path_to_jar=path_to_jar)
    # Setting higher memory limit for long sentences
    tagger.java_options = '-mx8192m'

    data = []
    for doc in progressbar.progressbar(docs):
        # Obtain the list of tokens in the document
        tokens = [t for t, label in doc]

        try:
            # Perform POS tagging
            tagged = tagger.tag(tokens)
        except:
            continue

        # Take the word, POS tag, and its label
        data.append([(w, pos, label)
                     for (w, label), (word, pos) in zip(doc, tagged)])
    return data
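A minimal usage sketch for pos_tagging; the document list, install path, and model name below are placeholders:

# Hypothetical call: docs is a list of documents, each a list of (token, label) pairs.
docs = [[("John", "NAME"), ("lives", "O"), ("in", "O"), ("Porto", "LOCATION")]]
tagged_docs = pos_tagging(docs, "/opt/stanford-postagger", "english-left3words-distsim")
# each returned entry is a list of (word, POS tag, label) triples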
Code example #11
def get_tagger():
    ''' Set up & return the Stanford Tagger object.'''
    path_to_model = "/home/avery/Applications/stanford-postagger-2018-02-27/models/english-bidirectional-distsim.tagger"
    path_to_jar = "/home/avery/Applications/stanford-postagger-2018-02-27/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = "-mx8192m"
    # Use: tagger.tag(word_tokenize(string))
    return tagger
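A short usage sketch of the helper above, mirroring the "Use:" comment; the sample sentence is only an illustration:

from nltk import word_tokenize

tagger = get_tagger()
print(tagger.tag(word_tokenize("The quick brown fox jumps over the lazy dog.")))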
Code example #12
 def __init__(self):
     """
     Initializes the tagger object
     """
     self.model = TAGGER_MODEL
     self.jar_file = POS_TAGGER_JAR_FILE
     self.tagger = StanfordPOSTagger(self.model, self.jar_file)
     self.tagger_type = STANFORD_TAGGER_NAME
Code example #13
    def part_of_speech_tagging(
            self, words: List[str],
            multi_word_name_entities: Set[str]) -> List[Tuple[str, str]]:
        """
        perform part-of-speech tagging using StanfordPOSTagger
        :param words: a list of words in a sentence
        :param multi_word_name_entities: a set of multi-word name entities
        :return: part-of-speech tag of the sentence
        """
        # define pos tagger
        path_to_model = 'stanford/pos/english-bidirectional-distsim.tagger'
        path_to_jar = 'stanford/pos/stanford-postagger.jar'
        pos_tagger = StanfordPOSTagger(path_to_model, path_to_jar)

        stan_pos_tag = pos_tagger.tag(words[:-1])  # omit the last period
        normal_pos_tag = nltk.pos_tag(words[:-1])  # omit the last period

        # print('Stanford POS tagging:', stan_pos_tag)        # for comparison
        # print('nltk.pos_tag tagging:', normal_pos_tag)      # for comparison

        def post_treatment(stan_pos_tag: List[Tuple[str, str]],
                           norm_pos_tag: List[Tuple[str, str]],
                           multi_word_name_entities: Set[str]) -> None:
            """
            combine the multi-word named entities
            nltk.pos_tag keeps a multi-word named entity as a single token (while the Stanford tagger splits it),
            so stan_pos_tag is corrected using norm_pos_tag; norm_pos_tag, however, often mislabels words,
            which is why StanfordPOSTagger is preferred for the tags themselves
            :param stan_pos_tag: pos-tags of the sentence from the Stanford POS tagger
            :param norm_pos_tag: pos-tags of the sentence from nltk.pos_tag
            :param multi_word_name_entities: a set of multi-word named entities
            """
            stan_len = len(stan_pos_tag)
            norm_len = len(norm_pos_tag)
            stan_i = 0
            norm_i = 0
            while stan_i < stan_len and norm_i < norm_len:
                stan_word, stan_pos = stan_pos_tag[stan_i]
                norm_word, norm_pos = norm_pos_tag[norm_i]
                # check if word exists in multi_word_name_entities
                if stan_word == norm_word.split(
                        ' ')[0] and norm_word in multi_word_name_entities:
                    # scan the following words in stan_pos_tag and combine if they can form a multi-word entity
                    temp_i = stan_i + 1
                    match_idx = 1
                    entities = norm_word.split(' ')
                    while temp_i < stan_len and match_idx < len(entities):
                        temp_word, temp_pos = stan_pos_tag[temp_i]
                        if temp_word == entities[match_idx]:
                            _ = stan_pos_tag.pop(temp_i)
                            match_idx += 1
                        else:
                            break
                    stan_pos_tag[stan_i] = (norm_word, stan_pos)
                stan_i += 1
                norm_i += 1

        post_treatment(stan_pos_tag, normal_pos_tag, multi_word_name_entities)

        return stan_pos_tag
Code example #14
File: POSTagger.py  Project: mdth/Masterarbeit
class POSTagger:
    """POSTagger creates a POS tagger for german language. Different tagger are available to use."""
    STAN = "stanford-hgc-tagger"
    SFT = "stanford-fast-tagger"
    TT = "tree-tagger"
    SPACY = "spacy-tagger"

    # paths to Stanford tagger modules
    __path_to_jar = "C:/Users/din_m/MA/Stanford Tagger/stanford-postagger.jar"
    __model_file_name = "C:/Users/din_m/MA/Stanford Tagger/models/"

    def __init__(self, tagger):
        """Initialize a new POS tagger. Takes tagger parameter as an argument to define the kind of tagger."""
        self.__tokenizer = StanfordTokenizer(path_to_jar=POSTagger.__path_to_jar)
        if tagger == POSTagger.STAN:
            self.tagger_name = POSTagger.STAN
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-hgc.tagger")
        elif tagger == POSTagger.SFT:
            self.tagger_name = POSTagger.SFT
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-fast.tagger")
        elif tagger == POSTagger.TT:
            self.tagger_name = POSTagger.TT
            self.__tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')

        # SpaCy takes really long to initialize (about 5-7 minutes), but performs well and fast afterwards
        elif tagger == POSTagger.SPACY:
            self.tagger_name = POSTagger.SPACY
            self.__tagger = spacy.load('de')
        else:
            raise Exception("Wrong tagger parameter.")

    def tag(self, text):
        """POS tag tokenized text."""
        if self.tagger_name == POSTagger.SFT or self.tagger_name == POSTagger.STAN:
            tokens = self.__tokenizer.tokenize(text)
            return self.__tagger.tag(tokens)
        elif self.tagger_name == POSTagger.TT:
            tags = self.__tagger.tag_text(text)
            tuple_list = []
            tag_list = treetaggerwrapper.make_tags(tags)
            for item in tag_list:
                tuple_list.append((item[0], item[1]))
            return tuple_list
        elif self.tagger_name == POSTagger.SPACY:
            tags = self.__tagger(text)
            tuple_list = []
            for word in tags:
                tuple_list.append((word.orth_, word.tag_))
            return tuple_list
        else:
            pass

#tagger = POSTagger("spacy-tagger")
#doc = tagger.tag(u"Bei mir zu Hause denken sie bestimmt, daß ich noch krank sei.")
#print(tagger.tag("Ich werde morgen in die Schule gehen."))
#print(tagger.tag("Hat Aglaja den Brief etwa der Alten gezeigt?«"))
Code example #15
def pos_tag(to_tag, stanford_postagger_path):
    '''Tag the tokens with part-of-speech tags. to_tag is the list of tokens to tag;
    stanford_postagger_path is the directory containing the Stanford POS tagger's
    models folder and the stanford-postagger.jar file.'''
    pos_tagger = StanfordPOSTagger(stanford_postagger_path +"\\models\\french.tagger",
                                   stanford_postagger_path +"\\stanford-postagger.jar",
                                   encoding='utf8') #create an object of class POSTagger that is encoded in UTF-8
    tags = pos_tagger.tag(to_tag) #run the tagging algorithm on the tokenized raw text
    return tags
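A short usage sketch; the install path and the French sample sentence are placeholders:

from nltk import word_tokenize

tokens = word_tokenize("Le chat dort sur le canapé.", language='french')
print(pos_tag(tokens, "C:\\stanford-postagger-full-2015-12-09"))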
Code example #16
def getTags(sen_arr):
    tag_arr = []
    st = StanfordPOSTagger('english-left3words-distsim.tagger')
    res = st.tag(sen_arr)
    for i in res:
        tag = i[1].encode("utf-8")
        tag_arr.append(tag)

    return tag_arr
Code example #17
    def determine_sentpos_by_nltk(self, sentence):
        '''Get the POS tags for the sentence using the Stanford tagger via NLTK.'''
        pos_model_file = "C://python34/ProjectDragonWolf/nlp_res/stanford_pos/models/english-bidirectional-distsim.tagger"
        pos_jar_file = "C://python34/ProjectDragonWolf/nlp_res/stanford_pos/stanford-postagger.jar"
        pos = StanfordPOSTagger(model_filename=pos_model_file,
                                path_to_jar=pos_jar_file)
        return pos.tag(sentence.split(" "))
Code example #18
def pos_tagger(text):
    from nltk.tag.stanford import StanfordPOSTagger
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar'
    )
    english_postagger.java_options = '-mx4096m'
    tags = english_postagger.tag(text)
    return tags
Code example #19
def posInput(text):
	print("POS")
	path_to_model = "./stanford-postagger/models/english-caseless-left3words-distsim.tagger"
	path_to_jar = "./stanford-postagger/stanford-postagger.jar"
	tagger=StanfordPOSTagger(path_to_model, path_to_jar)
	tagger.java_options='-mx4096m'          ### Setting higher memory limit for long sentences
	# sentence = 'THIS IS TESTING'
	result = tagger.tag(word_tokenize(text))
	# print result
	return result
Code example #20
    def tagger(self):
        self.tokenize(self.taggerUse)
        if self.taggerUse == 'standford':
            tagger = StanfordPOSTagger('/home/jmutal/dataMining/stanford-postagger-full-2017-06-09/models/spanish-distsim.tagger',
                               '/home/jmutal/dataMining/stanford-postagger-full-2017-06-09/stanford-postagger-3.8.0.jar')
            tagged_sents = tagger.tag_sents(self.clean_corpus)
        else:
            tagged_sents = self.nlp.pipe(self.clean_corpus, n_threads=8)
            tagged_sents = self.proccess_spacy(tagged_sents)

        return self.dig2num(tagged_sents)
Code example #21
def transform_wnli(premise,hypothesis):
    cased_premise=premise
    premise=[w.lower() for w in nltk.word_tokenize(premise)]

    #transform WNLI examples back into WSC format
    hypothesis = [w.lower() for w in nltk.word_tokenize(hypothesis)]
    best_target=["","","","","",""]#should get overwritten
    best_masked_s=[]
    for l in range(len(hypothesis)):
        for r in range(l+1,l+6):
            left_part = hypothesis[:l]
            right_part = hypothesis[r:]
            pattern = left_part + ["_"]+ right_part
            for s in range(len(premise)):
                ok=True
                if s+len(pattern)>len(premise):
                    break
                for a,b in zip(pattern,premise[s:s+len(pattern)]):
                    if a=="_":
                        continue
                    if a==b:
                        continue
                    if a in [',','.','?','!'] and b in [',','.','?','!']:#punctuation is ignored
                        continue
                    ok=False
                    break
                if ok and len(hypothesis[l:r])<=len(best_target):
                    best_target = hypothesis[l:r]
                    best_masked_s = premise[:s]+pattern+premise[s+len(pattern):]
    if len(best_masked_s)==0:#We failed
        return None,None
    #We extracted the masked sentence from the premise.
    global POS_tagger
    if POS_tagger is None:
        os.environ['STANFORD_MODELS'] = "stanford-postagger-2018-10-16/models"
        os.environ['CLASSPATH'] = "stanford-postagger-2018-10-16"
        POS_tagger = StanfordPOSTagger("stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger")
    tagged_premise = POS_tagger.tag(nltk.word_tokenize(cased_premise))
    candidates = []
    current=[]
    for word,tag in tagged_premise:
        if tag in ["NN","NNS","NNP","NNPS"]:
            current.append(word)
        else:
            if current!=[]:
                candidates.append(" ".join(current).lower())
                current=[]
    if current!=[]:
        candidates.append(" ".join(current).lower())
    best_target=" ".join(best_target)
    candidates=[c for c in candidates if c.find(best_target)==-1 and best_target.find(c)==-1]
    candidates = [best_target]+candidates
    found_sentence = " ".join(best_masked_s).replace(" n't","n't").replace(" 's","'s")#Sorry nltk
    return found_sentence,candidates
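A hedged usage sketch; the premise/hypothesis pair is illustrative, and the function assumes the stanford-postagger-2018-10-16 directory referenced above exists:

# Hypothetical WNLI-style pair, for illustration only.
premise = "The trophy doesn't fit into the suitcase because it is too large."
hypothesis = "The trophy is too large."
masked_sentence, candidates = transform_wnli(premise, hypothesis)
print(masked_sentence)  # premise with the extracted span replaced by "_" (or None on failure)
print(candidates)       # noun-phrase candidates, with the extracted target first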
Code example #22
 def tag(tokens):
     #java_path = "C:/Program Files/Java/jdk1.8.0_31/bin/java.exe"
     #os.environ['JAVAHOME'] = java_path
     special_symbols_array = ["the", "a", "an"]
     english_postagger = StanfordPOSTagger(
         'tagger/english-bidirectional-distsim.tagger',
         'tagger/stanford-postagger.jar')
     token_tag_array = english_postagger.tag(tokens)
     # filter out articles; build a new list instead of removing items while iterating
     token_tag_array = [element for element in token_tag_array
                        if element[0].lower() not in special_symbols_array]
     return token_tag_array
Code example #23
	def __init__(self):
		
		# the user needs to download the Stanford Parser, NER and POS tagger from the Stanford website
		self.constituent_parse_tree = StanfordParser()  # the user needs to set this path as an environment variable
		self.stanford_dependency = StanfordDependencyParser()  # the user needs to set this path as an environment variable
		self.lemma = WordNetLemmatizer()
		self.home = '/home/ramesh'
		#user needs to download stanford packages and change directory
		self.ner = StanfordNERTagger(self.home + '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',self.home + '/stanford-ner-2017-06-09/stanford-ner.jar')
		self.pos_tag = StanfordPOSTagger(self.home + '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger',self.home + '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar')
		self.CharacterOffsetEnd = 0 
		self.CharacterOffsetBegin = 0
Code example #24
	def __init__(self):
		
		# print "Inside ntlk util"
		self.constituent_parse_tree = StanfordParser()
		self.stanford_dependency = StanfordDependencyParser()
		self.lemma = WordNetLemmatizer()
		self.home = '/home/ramesh/Documents/mas_course/second_semester/rnd/rnd_submission_cd'
		self.ner = StanfordNERTagger(self.home + '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',self.home + '/stanford-ner-2017-06-09/stanford-ner.jar')
		self.pos_tag = StanfordPOSTagger(self.home + '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger',self.home + '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar')
		self.CharacterOffsetEnd = 0 
		self.CharacterOffsetBegin = 0
		self.contractions = {"'nt":"not", "'ll": " will", "'re":"are", "'ve":"have", "'m":"am"}
Code example #25
    def create_pos(self, tweet):
        self.pos_tweet = None

        tweet = word_tokenize(tweet.lower())

        english_pos = StanfordPOSTagger(
            'postagger/models/english-bidirectional-distsim.tagger',
            'postagger/stanford-postagger.jar')

        self.pos_tweet = english_pos.tag(tweet)

        return self.pos_tweet
Code example #26
	def __init__(self, posTagger, filePath, saveExtension, loadExtension, plainExtension, contextLength, replacementFrame, replacementFrameExtensions, replacementCount):
		self.posTagger_Model = posTagger + "/models/english-bidirectional-distsim.tagger"
		self.posTagger_Jar = posTagger + "/stanford-postagger.jar"
		self.filePath = filePath
		self.savePath = filePath + saveExtension
		self.loadPath = filePath + loadExtension
		self.plainPath = filePath + plainExtension
		self.contextLength = contextLength
		self.replacementFrame = replacementFrame
		self.replacementFrameExtensions = replacementFrameExtensions
		self.replacementCount = replacementCount
		self.allTokens = []
		self.tagger = StanfordPOSTagger(self.posTagger_Model, self.posTagger_Jar, java_options='-mx1000m -Xmx1028m')
Code example #27
    def set_pos_tagger(self, model_path=POS_MODEL, jar_path=POS_JAR):
        """Setup path for Standford POS tagger. Default value is configured in
        cfg.py

        Args:
            model_path: path to the trained model
            jar_path: path to the JAR
        """
        model_path = os.path.join(self.lib_path, model_path)
        jar_path = os.path.join(self.lib_path, jar_path)
        if os.path.isfile(model_path) and os.path.isfile(jar_path):
            self.pos_tagger = StanfordPOSTagger(model_path, jar_path)
        else:
            raise IOError('Cannot find POS tagging lib')
Code example #28
 def _POS(self, txt, id):
     self.df[['ID', 'pos']].to_csv('pos_ner.csv', sep='\t')
     path_pos = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/stanford-postagger.jar'
     model_path = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger'
     from nltk.tag.stanford import StanfordPOSTagger
     tagger = StanfordPOSTagger(model_path, path_pos)
     tagger.java_options = '-mx8096m'  ### Setting higher memory limit for long sentences
     tokens = nltk.word_tokenize(txt)
     pos_res = tagger.tag(tokens)
     filepath = '/home/ise/NLP/NLP3/pos/pos_{}.txt'.format(id)
     with open(filepath, 'w') as file_handler:
         for item in pos_res:
             file_handler.write("{}\n".format(item))
     return pos_res
Code example #29
def extractPOS(inputFile_data, inputFile_tags, inputFile_version,
               outputFile_pos):
    f = open(inputFile_tags)
    allTags = set(f.read().split(","))  # Load all tags
    f.close()

    f = open(inputFile_version)
    lines = f.readlines()
    f.close()
    tag_version = []  # tags with version number
    for index, row in enumerate(lines):
        items = row.strip().split()
        if items[0] in allTags:
            for tag in items[1].split(","):
                tag_version.append(tag)

    print "The number of tag_version is: ", len(tag_version)
    tag_version = set(tag_version)

    fw_pos = open(outputFile_pos, "w")
    english_postagger = StanfordPOSTagger(
        '/Users/songshuaichen/Downloads/jars/models/english-bidirectional-distsim.tagger'
    )
    f = open(inputFile_data)
    lines = f.readlines()
    f.close()

    for index, row in enumerate(lines):
        if index % 300 == 0:
            print index, " Finish ", float(index) / len(lines)
        items = row.strip().split("\t\t")

        # if index >=5000 and index < 6000 and items[0] in tag_version:
        if items[0] in tag_version:
            fw_pos.write(str(index) + "\t" + items[0] + "\t\n")
        if items[0] not in tag_version:

            fw_pos.write(str(index) + "\t" + items[0] + "\t")
            if len(items) > 1:
                text = items[1].split(". ")[0].decode('utf-8')
                pos = english_postagger.tag(text.split())

                for p in pos:
                    fw_pos.write(str(p))
                    fw_pos.write("	")

            fw_pos.write("\n")

    fw_pos.close()
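A minimal usage sketch for extractPOS; all four arguments are placeholder file paths:

# Hypothetical call: data file, tag list, version file, and POS output file (all placeholders).
extractPOS("items_data.txt", "all_tags.txt", "tag_versions.txt", "items_pos.txt")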
Code example #30
File: transform.py  Project: rayansamy/Notable
def transform_to_pos(text):
    import os
    #os.environ['JAVAHOME'] = java_path
    from nltk.corpus import sentiwordnet as swn
    from nltk.tag.stanford import StanfordPOSTagger
    from nltk import word_tokenize

    path_to_model = "./postagging/english-bidirectional-distsim.tagger"
    path_to_jar = "./postagging/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = '-mx4096m'  ### Setting higher memory limit for long sentences
    tokens = word_tokenize(text)
    size = len(tokens)
    from collections import Counter
    pos = tagger.tag(tokens)
    counts = Counter(tag for word, tag in pos)
    for key in counts:
        counts[key] /= size
    counts["totalWordsCount"] = size
    counts[";"] = tokens.count(";") / size
    counts["questionmarks"] = tokens.count("?") / size
    counts["exclamationmarks"] = tokens.count("!") / size
    counts["Quotes"] = tokens.count("\"") / size
    try:
        counts.pop(".")
    except:
        pass
    from collections import OrderedDict
    ot = [
        'NNP', 'VBD', 'VBN', 'IN', 'CD', 'VBP', ',', 'DT', 'NN', 'JJ', 'RB',
        'TO', 'SYM', 'PRP', 'NNS', 'CC', 'PRP$', 'POS', 'FW', 'VBG', ':',
        'WRB', 'EX', 'JJR', 'WDT', 'totalWordsCount', ';', 'questionmarks',
        'exclamationmarks', 'Quotes'
    ]
    counts = OrderedDict(counts)
    for key in ot:
        if key in counts:
            pass
        else:
            counts[key] = 0
    tmp = counts.copy()
    for key in tmp:
        if key not in ot:
            counts.pop(key, None)
    dab = {}
    for i in ot:
        dab[i] = counts[i]
    counts = dab.copy()
    return counts
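A brief usage sketch, assuming the ./postagging paths above point at a local tagger install; the sentence is only an example:

features = transform_to_pos("Is this a question? Yes, it is!")
print(features["totalWordsCount"], features["questionmarks"], features["NN"])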
Code example #31
def build_question_set():
    sv_file = 'data/kprestval_pos_tags.json'
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    meta = load_and_process_metadata('val')
    images = split_data_by_seed(meta, 'kprestval')
    num = len(images)
    pos_tags_dict = {}
    for i, info in enumerate(images):
        question_id = info.question_id
        question = info.question.lower()
        _pos_tags = st.tag(word_tokenize(question))
        pos_tags_dict[question_id] = _pos_tags
        print('\nPOS TAGGER: %d/%d' % (i, num))
        print(_pos_tags)
    save_json(sv_file, {'pos_tags': pos_tags_dict})
Code example #32
 def __init__(self):
     self._postagger = StanfordPOSTagger(self.modeldir + '/postagger/models/english-bidirectional-distsim.tagger',
                                        self.modeldir + '/postagger/stanford-postagger.jar')
     self._stemmer = nltk.SnowballStemmer("english")
     self._stopwords = stopword(self.stopword_path)
     self._type_words = self._set_type_words()
     self._sentiment = self._get_sentiment()
Code example #33
	def __init__(self, text):
		reload(sys)  
		sys.setdefaultencoding('utf8')

		self.text = text

		root = os.path.dirname(os.path.realpath(__file__))
		os.environ["STANFORD_PARSER"] = root+"/stanford-postagger/stanford-postagger.jar"
		os.environ["STANFORD_MODELS"] = root+"/stanford-postagger/models/"
		
		
		_path_to_model  = root + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
		_path_to_jar    = root + '/stanford-postagger/stanford-postagger.jar'
		self.stanford   = StanfordPOSTagger(_path_to_model, _path_to_jar)

		self.sentences  = sent_tokenize(text.encode("utf-8"))
		self.words      = word_tokenize(text.encode("utf-8"))
		

		self.tags = self.stringifyTuples(self.stanford.tag( word_tokenize(text.lower()) ))
		#cleanWords = self.cleanWords()
		#self.tags = self.stringifyTuples(self.stanford.tag( cleanWords ))
		#print self.cleanWords()
		
		self.taggedBigrams = self.ngramsAndTags(2) 
Code example #34
File: POSTagger.py  Project: mdth/Masterarbeit
    def __init__(self, tagger):
        """Initialize a new POS tagger. Takes tagger parameter as an argument to define the kind of tagger."""
        self.__tokenizer = StanfordTokenizer(path_to_jar=POSTagger.__path_to_jar)
        if tagger == POSTagger.STAN:
            self.tagger_name = POSTagger.STAN
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-hgc.tagger")
        elif tagger == POSTagger.SFT:
            self.tagger_name = POSTagger.SFT
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-fast.tagger")
        elif tagger == POSTagger.TT:
            self.tagger_name = POSTagger.TT
            self.__tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')

        # SpaCy takes really long to initialize (about 5-7 minutes), but performs well and fast afterwards
        elif tagger == POSTagger.SPACY:
            self.tagger_name = POSTagger.SPACY
            self.__tagger = spacy.load('de')
        else:
            raise Exception("Wrong tagger parameter.")
Code example #35
File: posngrams.py  Project: dpritsos/DoGSWrapper
class String2POSNGramsList(String2TokenList):

    def __init__(self, n=1, tagger_cls='english-left3words-distsim.tagger'):

        # Other Taggers:
        #   1. 'english-bidirectional-distsim.tagger'
        #   2. 'english-left3words-distsim.tagger'

        super(String2POSNGramsList, self).__init__()

        # N-Grams size
        self.n = n

        # Tagger Class Selection... See detail in Stanford Tagger documentation.
        self.tagger_cls = tagger_cls

        # Getting the Stanford tagger instance.
        self.spt = StanfordPOSTagger(self.tagger_cls)
        # self.spt = CoreNLPPOSTagger(url='http://localhost:9000')
        self.spt.java_options = '-mx10g'

    @property
    def N(self):
        return self.n

    @N.setter
    def N(self, value):
        self.n = value

    @property
    def Tagger_cls(self):
        return self.tagger_cls

    @Tagger_cls.setter
    def Tagger_cls(self, value):
        self.tagger_cls = value

    def terms_lst(self, text):

        # Getting the Analysed list of tokens.
        analyzed_terms_lst = self.token_lst(text)

        # Tagging the Analyzed terms list and getting the tags list as terms.
        pos_tags = [pos for t, pos in self.spt.tag(analyzed_terms_lst)]

        # Constructing the Words N-Grams List
        analyzed_terms_lst = [
            " ".join(pos_tags[i: i+self.n])
            for i in range(len(pos_tags) - self.n + 1)
        ]

        return analyzed_terms_lst
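A usage sketch under the assumption that CLASSPATH and STANFORD_MODELS point at a local Stanford tagger install; String2TokenList, the parent class, is defined elsewhere in the project:

# Hypothetical usage: POS bigrams for a sample sentence.
extractor = String2POSNGramsList(n=2)
print(extractor.terms_lst("The quick brown fox jumps over the lazy dog."))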
Code example #36
def get_pos_sentence(sentences_spans, pos_vocab):
    """
    Get POS tags for each sentence (needed to build an end-to-end system).
    :param sentences_spans: sentence spans, where each span's first element is the sentence text
    :param pos_vocab: counter of POS-tag frequencies, updated in place
    :return: the POS-tagged sentences and the updated pos_vocab
    """
    #raw_dir_simple = read.read_from_json('test/test_dir_simple')   #### in folder data/
    #raw_dir_simple = read.read_from_json('clinical_data/train_samples1_simples')
    #raw_dir_simple = read.read_from_json('agriculture_data/raw_dir_simple')

    #raw_dir_simple = ["NYT19980206.0466"]
    english_postagger = StanfordPOSTagger(
        StandforParser,    #### in folder data/
        StandforParser_jar) #### in folder data/
    english_postagger.java_options = '-mx8000m'
    pos_sentences = list()

    for sent_span in sentences_spans:
        print(sent_span[0])
        text = nltk.word_tokenize(sent_span[0])
        text_pos = english_postagger.tag(text)
        # StanfordPOSTagger fails to tag tokens that contain underscores; see https://github.com/nltk/nltk/issues/1632
        # With nltk 3.2.2, change "word_tags = tagged_word.strip().split(self._SEPARATOR)" in the parse_output
        # function of nltk.tag.stanford to "word_tags = tagged_word.strip().rsplit(self._SEPARATOR, 1)" to handle underscores.

        index = 0
        for token in text_pos:
            # if (text[index] != token[0]) and (token[0] == '``' or token[0] == "''"):  ######### deal with the double quotes, in nltk.tokenize treebank.py change the tokenizer for double quotes. Reasons: (double quotes (") are changed to doubled single forward- and backward- quotes (`` and ''))
            #     text_pos[index] = ["\"", "\'\'"]
            if text[index] == token[0] and token[0] == "``"  and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "``"]
            if text[index] ==token[0] and token[0] == "''"  and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "\'\'"]
            if text[index] == token[0] and token[0] in ['{','(','['] :
                text_pos[index] = [token[0],"("]
            if text[index] == token[0] and token[0] in ['}',')',']']:
                text_pos[index] = [token[0],")"]
            pos_vocab[token[1]]+=1
            index+=1
        pos_sentences.append(text_pos)
    return pos_sentences,pos_vocab
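The underscore issue flagged in the comment above comes down to how each word_TAG pair is split; a minimal sketch of the suggested rsplit fix:

# Illustration only: splitting on the last separator keeps underscores inside the token.
tagged_word = "my_variable_NN"          # tagger output for a token containing underscores
word, tag = tagged_word.rsplit("_", 1)  # -> ("my_variable", "NN")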
Code example #37
File: selectors.py  Project: pombredanne/phd-backup
	def __init__(self, pos_model, stanford_tagger, java_path):
		"""
		Creates a POSTagSelector instance.
	
		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
		"""
		os.environ['JAVAHOME'] = java_path
		self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)
Code example #38
File: selectors.py  Project: pombredanne/phd-backup
	def __init__(self, condprob_model, pos_model, stanford_tagger, java_path):
		"""
		Creates a POSTagSelector instance.
	
		@param condprob_model: Path to a binary conditional probability model.
		For instructions on how to create the model, please refer to the LEXenstein Manual.
		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
		"""
		os.environ['JAVAHOME'] = java_path
		self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)
		self.model = pickle.load(open(condprob_model, 'rb'))
Code example #39
File: posngrams.py  Project: dpritsos/DoGSWrapper
    def __init__(self, n=1, tagger_cls='english-left3words-distsim.tagger'):

        # Other Taggers:
        #   1. 'english-bidirectional-distsim.tagger'
        #   2. 'english-left3words-distsim.tagger'

        super(String2POSNGramsList, self).__init__()

        # N-Grams size
        self.n = n

        # Tagger Class Selection... See detail in Stanford Tagger documentation.
        self.tagger_cls = tagger_cls

        # Getting the Stanford tagger instance.
        self.spt = StanfordPOSTagger(self.tagger_cls)
        # self.spt = CoreNLPPOSTagger(url='http://localhost:9000')
        self.spt.java_options = '-mx10g'
Code example #40
    def __init__(self, text):
        reload(sys)
        sys.setdefaultencoding('utf8')

        self.text = text

        root = os.path.dirname(os.path.realpath(__file__))
        # truncated paths restored from the fuller version of this snippet (code example #33)
        os.environ["STANFORD_PARSER"] = root + "/stanford-postagger/stanford-postagger.jar"
        os.environ["STANFORD_MODELS"] = root + "/stanford-postagger/models/"
        _path_to_model = root + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
        _path_to_jar = root + '/stanford-postagger/stanford-postagger.jar'
        self.stanford = StanfordPOSTagger(_path_to_model, _path_to_jar)
        self.sentences = sent_tokenize(text.encode("utf-8"))
        self.words = word_tokenize(text.encode("utf-8"))

        self.tags = self.stringifyTuples(self.stanford.tag(word_tokenize(text.lower())))
        #cleanWords

        self.taggedBigrams = self.ngramsAndTags(2)
Code example #41
File: selectors.py  Project: pombredanne/phd-backup
	def __init__(self, vector_model, pos_model, stanford_tagger, java_path, pos_type='none'):
		"""
		Creates an instance of the WordVectorSelector class.
	
		@param vector_model: Path to a binary word vector model.
		For instructions on how to create the model, please refer to the LEXenstein Manual.
		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
		@param pos_type: The type of POS tags with which the model's words are annotated, if any.
		Values supported: none, treebank, paetzold
		"""
		self.model = gensim.models.word2vec.Word2Vec.load_word2vec_format(vector_model, binary=True)
		self.pos_type = pos_type
		os.environ['JAVAHOME'] = java_path
		self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)
Code example #42
File: selectors.py  Project: pombredanne/phd-backup
class POSTagSelector:

	def __init__(self, pos_model, stanford_tagger, java_path):
		"""
		Creates a POSTagSelector instance.
	
		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
		"""
		os.environ['JAVAHOME'] = java_path
		self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)

	def selectCandidates(self, substitutions, victor_corpus):
		"""
		Selects which candidates can replace the target complex words in each instance of a VICTOR corpus.
	
		@param substitutions: Candidate substitutions to be filtered.
		It can be in two formats:
		A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions.
		Example: substitutions['perched'] = {'sat', 'roosted'}
		A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector.
		Example: [['sat', 'roosted'], ['easy', 'uncomplicated']]
		@param victor_corpus: Path to a corpus in the VICTOR format.
		For more information about the file's format, refer to the LEXenstein Manual.
		@return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus.
		"""
		selected_substitutions = []

		substitution_candidates = []
		if isinstance(substitutions, list):
			substitution_candidates = substitutions
		elif isinstance(substitutions, dict):
			void = VoidSelector()
			substitution_candidates = void.selectCandidates(substitutions, victor_corpus)
		else:
			print('ERROR: Substitutions are neither a dictionary nor a list!')
			return selected_substitutions
		
		#Read VICTOR corpus:
		lexf = open(victor_corpus)
		sents = []
		targets = []
		heads = []
		words = set([])
		c = -1
		for line in lexf:
			c += 1
			data = line.strip().split('\t')
			sent = data[0].strip().split(' ')
			target = data[1].strip()
			head = int(data[2].strip())
			sents.append(sent)
			targets.append(target)
			heads.append(head)
			words.update(set(substitution_candidates[c]))
		lexf.close()
		
		#Tag sentences:
		tagged_sents = self.tagger.tag_sents(sents)
		
		#Tag words:
		words = list(words)
		words_sents = [[w] for w in words]
		tagged_words = self.tagger.tag_sents(words_sents)
		word_to_tag = {}
		for i in range(0, len(words)):
			word_to_tag[words[i]] = tagged_words[i][0][1]
		
		for i in range(0, len(sents)):
			target = targets[i]
			head = heads[i]
			target_pos = str(tagged_sents[i][head][1])
		
			candidates = []
			candidates = set(substitution_candidates[i])
			candidates = self.getCandidatesWithSamePOS(candidates, word_to_tag, target_pos)
		
			selected_substitutions.append(candidates)
		lexf.close()
		return selected_substitutions
	
	def getTargetPOS(self, sent, target, head):
		pos_data = []
		try:
			pos_data = nltk.pos_tag(sent)
			return pos_data[head][1]
		except UnicodeDecodeError:
			try:
				pos_data = nltk.pos_tag(target)
				return pos_data[0][1]
			except UnicodeDecodeError:
				return 'None'
			
		
	def getCandidatesWithSamePOS(self, candidates, word_to_tag, target_pos):
		result = set([])
		for candidate in candidates:
			if candidate in word_to_tag:
				ctag = word_to_tag[candidate]
				if ctag==target_pos:
					result.add(candidate)
		return result
	
	def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False):
		"""
		Saves a set of selected substitutions in a file in VICTOR format.
	
		@param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected.
		@param substitutions: The vector of substitutions selected for the VICTOR corpus.
		@param output_path: The path in which to save the resulting VICTOR corpus.
		@param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution.
		"""
		o = open(output_path, 'w')
		f = open(victor_corpus)
		for subs in substitutions:
			data = f.readline().strip().split('\t')
			sentence = data[0].strip()
			target = data[1].strip()
			head = data[2].strip()
			
			newline = sentence + '\t' + target + '\t' + head + '\t'
			for sub in subs:
				newline += '0:'+sub + '\t'
			o.write(newline.strip() + '\n')
		f.close()
		o.close()
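A hedged usage sketch for POSTagSelector; the model, jar, java, and corpus paths are placeholders, and the candidate list follows the format shown in the docstring:

# Hypothetical usage of POSTagSelector.selectCandidates.
selector = POSTagSelector('models/english-left3words-distsim.tagger',
                          'stanford-postagger.jar',
                          '/usr/bin/java')
substitutions = [['sat', 'roosted'], ['easy', 'uncomplicated']]
selected = selector.selectCandidates(substitutions, 'dataset.victor')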
Code example #43
File: userInput.py  Project: Fragipani/EZlearner
# /usr/local/bin/python
# coding: latin-1

import nltk, string, os

from random import randint
from nltk.tag.stanford import StanfordPOSTagger

os.environ['CLASSPATH'] = '../ressources/standforPOS/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = '../ressources/standforPOS/models'

st = StanfordPOSTagger('german-hgc.tagger')

#tagset: http://www.coli.uni-saarland.de/projects/sfb378/negra-corpus/stts.asc



##############################
# Import the text
##############################

f = open("../ressources/reiseberichtIndien.txt")
raw_text = f.read()
start_bookmark = raw_text.find("Erstes Kapitel")
end_bookmark = raw_text.rfind("Im Verlag von R")
text = raw_text[start_bookmark:end_bookmark]



##############################
#  Tokenize the text
Code example #44
	elif tag.startswith('J'):
		result = 'J'
	elif tag.startswith('W'):
		result = 'W'
	elif tag.startswith('PRP'):
		result = 'P'
	else:
		result = tag.strip()
	return result

model = '/export/data/ghpaetzold/benchmarking/lexmturk/scripts/evaluators/stanford-postagger-full-2015-04-20/models/english-bidirectional-distsim.tagger'
tagger = '/export/data/ghpaetzold/benchmarking/lexmturk/scripts/evaluators/stanford-postagger-full-2015-04-20/stanford-postagger.jar'
java = '/usr/bin/java'

os.environ['JAVAHOME'] = java
tagger = StanfordPOSTagger(model, tagger, java_options='-Xmx6g')

f = open('ratings.txt')

sents1 = []
sents2 = []
heads1 = []
heads2 = []
for line in f:
	data = line.strip().split('\t')
	word1 = data[1].strip()
	word2 = data[3].strip()
	sent1 = data[5].strip()
	newsent1 = ''
	tokens = sent1.split(' ')
	index1 = -1
Code example #45
File: selectors.py  Project: pombredanne/phd-backup
class WordVectorSelector:
	
	def __init__(self, vector_model, pos_model, stanford_tagger, java_path, pos_type='none'):
		"""
		Creates an instance of the WordVectorSelector class.
	
		@param vector_model: Path to a binary word vector model.
		For instructions on how to create the model, please refer to the LEXenstein Manual.
		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
		@param pos_type: The type of POS tags with which the model's words are annotated, if any.
		Values supported: none, treebank, paetzold
		"""
		self.model = gensim.models.word2vec.Word2Vec.load_word2vec_format(vector_model, binary=True)
		self.pos_type = pos_type
		os.environ['JAVAHOME'] = java_path
		self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)
	
	def selectCandidates(self, substitutions, victor_corpus, proportion=1.0, proportion_type='percentage', stop_words_file=None, window=99999, onlyInformative=False, keepTarget=False, onePerWord=False):
		"""
		Selects which candidates can replace the target complex words in each instance of a VICTOR corpus.
	
		@param substitutions: Candidate substitutions to be filtered.
		It can be in two formats:
		A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions.
		Example: substitutions['perched'] = {'sat', 'roosted'}
		A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector.
		Example: [['sat', 'roosted'], ['easy', 'uncomplicated']]
		@param victor_corpus: Path to a corpus in the VICTOR format.
		For more information about the file's format, refer to the LEXenstein Manual.
		@param proportion: Percentage of substitutions to keep.
		If proportion_type is set to "percentage", then this parameter must be a floating point number between 0 and 1.
		If proportion_type is set to "integer", then this parameter must be an integer number.
		@param proportion_type: Type of proportion to be kept.
		Values supported: percentage, integer.
		@param stop_words_file: Path to the file containing stop words of the desired language.
		The file must contain one stop word per line.
		@param window: Number of tokens around the target complex sentence to consider as its context.
		@param onlyInformative: If True, only content words are considered as part of the complex word's context, such as nouns, verbs, adjectives and adverbs.
		@param keepTarget: If True, the complex target word is also included as part of its context.
		@param onePerWord: If True, a word in the complex word's context can only contribute once to its resulting word vector.
		@return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus.
		"""
		#Initialize selected substitutions:
		selected_substitutions = []
		
		#Read stop words:
		stop_words = set([])
		if stop_words_file != None:
			stop_words = set([word.strip() for word in open(stop_words_file)])

		#Configure input:
		substitution_candidates = []
		if isinstance(substitutions, list):
			substitution_candidates = substitutions
		elif isinstance(substitutions, dict):
			void = VoidSelector()
			substitution_candidates = void.selectCandidates(substitutions, victor_corpus)
		else:
			print('ERROR: Substitutions are neither a dictionary nor a list!')
			return selected_substitutions		

		#Parse sentences:
		lexf = open(victor_corpus)
		sents = [line.strip().split('\t')[0].strip().split(' ') for line in lexf]
		lexf.close()
		tagged_sents = self.tagger.tag_sents(sents)
		
		#Transform them to the right format:
		if self.pos_type=='paetzold':
			transformed = []
			for sent in tagged_sents:
				tokens = []
				for token in sent:
					tokens.append((token[0], getGeneralisedPOS(token[1])))
				transformed.append(tokens)
			tagged_sents = transformed
		
		#Rank candidates:
		c = -1
		lexf = open(victor_corpus)
		for line in lexf:
			c += 1
			data = line.strip().split('\t')
			sent = data[0].strip()
			target = data[1].strip()
			head = int(data[2].strip())
			pos_tags = tagged_sents[c]
			target_pos = pos_tags[head][1]
		
			target_vec = self.getSentVec(sent, head, stop_words, window, onlyInformative, keepTarget, onePerWord, pos_tags)
			candidates = substitution_candidates[c]

			candidate_dists = {}
			for candidate in candidates:
				candidate_vec = self.getWordVec(candidate, target_pos)
				try:
					candidate_dists[candidate] = cosine(candidate_vec, target_vec)
				except ValueError:
					candidate_dists = candidate_dists

			final_candidates = self.getFinalCandidates(candidate_dists, proportion, proportion_type)

			selected_substitutions.append(final_candidates)
		lexf.close()
		return selected_substitutions
		
	def getSentVec(self, sentence, head, stop_words, window, onlyInformative, keepTarget, onePerWord, pos_tokens):
		informative_tags = set([])
		if onlyInformative:
			if self.pos_type=='treebank':
				informative_tags = set(['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB', 'RBR', 'RBS'])
			if self.pos_type=='paetzold':
				informative_tags = set(['N', 'V', 'J', 'R'])
		
		tokens = sentence.split(' ')
		
		valid_tokens = []
		if keepTarget:
			valid = tokens[head].strip()
			if self.pos_type!='none':
				valid += '|||' + pos_tokens[head][1]
			valid_tokens.append(valid)
		
		if head>0:
			for i in range(max(0, head-window), head):
				if len(informative_tags)==0 or pos_tokens[i][1].lower().strip() in informative_tags:
					if tokens[i] not in stop_words:
						valid = tokens[i]
						if self.pos_type!='none':
							valid += '|||' + pos_tokens[i][1]
						valid_tokens.append(valid)
		
		if head<len(tokens)-1:
			for i in range(head+1, min(len(tokens), head+1+window)):
				if len(informative_tags)==0 or pos_tokens[i][1].lower().strip() in informative_tags:
					if tokens[i] not in stop_words:
						valid = tokens[i]
						if self.pos_type!='none':
							valid += '|||' + pos_tokens[i][1]
						valid_tokens.append(valid)
						
		if onePerWord:
			valid_tokens = list(set(valid_tokens))
		
		result = []
		for	token in valid_tokens:
			if len(result)==0:
				try:
					result = self.model[token]
				except Exception:
					result = []
			else:
				try:
					result = np.add(result, self.model[token])
				except Exception:
					result = result
		result = result/float(len(valid_tokens))
		return result
		
	def getWordVec(self, candidate, target_pos):
		cand = None
		if self.pos_type!='none':
			cand = candidate + '|||' + target_pos
		else:
			cand = candidate

		result = np.array([])
		try:
			result = self.model[cand]
		except Exception:
			pass
		return result
				
	def getFinalCandidates(self, candidate_dists, proportion, proportion_type):
		result = sorted(list(candidate_dists.keys()), key=candidate_dists.__getitem__)
		if proportion_type=='percentage':
			return result[0:max(1, int(proportion*float(len(result))))]
		elif proportion_type=='integer':
			if proportion>=len(result):
				return result
			else:
				return result[0:max(1, int(proportion))]
		else:
			print('Unrecognized proportion type.')
			return result
		
	def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False):
		"""
		Saves a set of selected substitutions in a file in VICTOR format.
	
		@param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected.
		@param substitutions: The vector of substitutions selected for the VICTOR corpus.
		@param output_path: The path in which to save the resulting VICTOR corpus.
		@param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution.
		"""
		o = open(output_path, 'w')
		f = open(victor_corpus)
		for subs in substitutions:
			data = f.readline().strip().split('\t')
			sentence = data[0].strip()
			target = data[1].strip()
			head = data[2].strip()
			
			newline = sentence + '\t' + target + '\t' + head + '\t'
			for sub in subs:
				newline += '0:'+sub + '\t'
			o.write(newline.strip() + '\n')
		f.close()
		o.close()
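
For reference, the lines written by toVictorFormat above can be read back with a small helper. This is only a hedged sketch: the file name and the helper are illustrative assumptions, and the column layout simply mirrors the write logic shown in the method.

# Hedged sketch: parse a line produced by toVictorFormat back into its parts.
# 'selected_victor.txt' and read_victor_line are illustrative assumptions.
def read_victor_line(line):
    fields = line.strip().split('\t')
    sentence, target, head = fields[0], fields[1], int(fields[2])
    candidates = [c.split(':', 1)[1] for c in fields[3:]]  # drop the "0:" rank prefix
    return sentence, target, head, candidates

with open('selected_victor.txt') as f:
    for line in f:
        print(read_victor_line(line))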
コード例 #46
0
ファイル: featExtrWithSmor.py プロジェクト: cyriaka90/MyFiles
## This code extracts the features for several glosses and stores them in two text files to be fed to evaluation.py or predictGoodness.py

## import everything needed
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordPOSTagger
import string
from pattern.de import singularize
import subprocess
import os

## set variables
parser=StanfordParser(model_path="edu/stanford/nlp/models/lexparser/germanPCFG.ser.gz")
st = StanfordPOSTagger('german-dewac.tagger')
featuresPhrases = []
finalRatings = []
count=0
path = '/home/hanna/Documents/SMOR/'

## read in the word frequencies from DeReWo
derewo = open('derewo-v-ww-bll-320000g-2012-12-31-1.0.txt')
freqWo= []
freqNo= []
for lines in derewo:
	lines = lines.strip()               
	parts = lines.split(" ")
	freqWo.append(parts[0].lower())
	freqNo.append(int(float(parts[1])))
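
The excerpt stops after loading the DeReWo list. A hedged sketch of how such a lookup might be turned into a per-word frequency feature follows; the helper name and the -1 fallback are assumptions for illustration, not taken from the project.

# Hedged sketch: DeReWo frequency class for a word, falling back to its singular
# form via pattern.de; -1 marks words missing from the list. Illustrative only.
def frequency_class(word, freqWo, freqNo):
    w = word.lower()
    if w in freqWo:
        return freqNo[freqWo.index(w)]
    singular = singularize(w).lower()
    if singular in freqWo:
        return freqNo[freqWo.index(singular)]
    return -1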
コード例 #47
0
ファイル: SenticParser.py プロジェクト: RTsGIT/LocalSentic
	def __init__(self):
		self.st = StanfordPOSTagger('stanford-postagger/models/english-bidirectional-distsim.tagger', 'stanford-postagger/stanford-postagger.jar')
コード例 #48
0
ファイル: SenticParser.py プロジェクト: RTsGIT/LocalSentic
class SenticParser:
	def __init__(self):
		self.st = StanfordPOSTagger('stanford-postagger/models/english-bidirectional-distsim.tagger', 'stanford-postagger/stanford-postagger.jar')

	def TaggedSentenceSlashForm(self, sentence ):

		#print sentence.split()
		Tagged = self.st.tag(sentence.split())

		TaggedSentence = ""
		for i in Tagged:
			TaggedSentence = TaggedSentence+"/".join(i)+" "


		#print TaggedSentence
		return TaggedSentence


	def TaggedSentence(self, sentence ):
		Tagged = self.st.tag(sentence.split())
		return Tagged


	def FindStemmedVerb(self, word):
		st = LancasterStemmer()
		StemmedVerb = st.stem(word)
		
		dic = enchant.Dict("en_US")
		if( dic.check(StemmedVerb) ):
			return StemmedVerb
		else:
			return StemmedVerb+"e"			
	

	def FindSplit(self, sentence, TaggedSentence):
		TokenizedSentence = nltk.word_tokenize(sentence)

		SplitList = []
		SentAdded = ""
		split = 0 

		#print TaggedSentence

		for i in range(len(TaggedSentence)):
			if TaggedSentence[i][1].startswith("VB"):
				SplitList.append(SentAdded)
				try:
					if (TaggedSentence[i+1][1].startswith("VB")):
						SentAdded = ""
					else:
						SplitList.append(SentAdded)
						SentAdded = TaggedSentence[i][0]+" "
					#	print "split"
				except:
					SplitList.append(TaggedSentence[i][0]) 
				
			else:
				#print SentAdded
				SentAdded = SentAdded + TokenizedSentence[i] + " "
							
		SplitList.append(SentAdded)		
	

		Str_list = filter(None, SplitList)
		Str_list = list(set(Str_list))

		'''
		for i in range(len(Str_list)):
			Str_list[i] = Str_list[i][:-1].translate(string.maketrans("",""), string.punctuation)
		'''
		return Str_list
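
A hedged usage sketch for the SenticParser class above; the sentence is arbitrary and the output depends on the installed tagger model.

# Hedged usage sketch for SenticParser; illustrative only.
parser = SenticParser()
sentence = "I bought a gift and wrapped it"
tagged = parser.TaggedSentence(sentence)         # list of (word, tag) tuples
print parser.TaggedSentenceSlashForm(sentence)   # e.g. "I/PRP bought/VBD ..."
print parser.FindSplit(sentence, tagged)         # fragments split around verb positions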
コード例 #49
0
'''
Created on Mar 11, 2016

@author: zhongzhu
'''
import os

from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger


st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
st.tag('What is the airspeed of an unladen swallow ?'.split())

st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') 
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))

dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")]
コード例 #50
0
ファイル: ParseBigrams.py プロジェクト: RTsGIT/LocalSentic
class Parser:

    def __init__(self):
        self.MatchList = []
        self.ConceptMatches = []
        self.st = StanfordPOSTagger('stanford-postagger/models/english-bidirectional-distsim.tagger', 'stanford-postagger/stanford-postagger.jar')

    # Checks for syntactic similarity: looks for matching noun tokens between two concepts.
    def SyntacticMatch(self, concept1, concept2):
        TaggedConcept1 = self.st.tag(nltk.word_tokenize(concept1))
        TaggedConcept2 = self.st.tag(nltk.word_tokenize(concept2))

        print TaggedConcept1
        print TaggedConcept2

        flag = 0

        for i in TaggedConcept1:
            for j in TaggedConcept2:
                if (i == j):
                    if i[1].startswith("NN"):
                        flag = 1

        if (flag == 1):
            return True
        else:
            return False

    # Finds all bigrams associated with the concept. The concept is POS-tagged and split
    # into bigrams, e.g. "a very special christmas gift" yields ["a very", "very special",
    # "special christmas", "christmas gift"], which are then filtered by the rules below.
    def FindBigrams(self, concept):
        sentence = self.st.tag(nltk.word_tokenize(concept))

        print sentence

        Bigrams = []

        for i in range(len(sentence) - 1):
            if (sentence[i][1] == "JJ" and sentence[i+1][0] in stopwords.words('english')):
                # [ adjective + stopword ]: ignore (bigrams like "a very" are dropped)
                continue

            elif (sentence[i][0] in stopwords.words('english') and sentence[i+1][0] in stopwords.words('english')):
                # [ stopword + stopword ]: ignore
                continue

            elif (sentence[i+1][1] == "JJ" and sentence[i][0] in stopwords.words('english')):
                # [ stopword + adjective ]: ignore (bigrams like "amazingly a" are dropped)
                continue

            elif (sentence[i][1] == "JJ" and sentence[i+1][1].startswith("NN")):
                # [ adjective + noun ]: keep both the noun and the full bigram,
                # e.g. "special christmas" yields "christmas" and "special christmas"
                Bigrams.append(sentence[i+1][0])
                Bigrams.append(sentence[i][0] + " " + sentence[i+1][0])

            elif (sentence[i][0] in stopwords.words("english") and sentence[i+1][1].startswith("NN")):
                # [ stopword + noun ]: keep the noun with and without the stopword,
                # e.g. "the christmas" yields "christmas" and "the christmas"
                Bigrams.append(sentence[i+1][0])
                Bigrams.append(sentence[i][0] + " " + sentence[i+1][0])

            elif (sentence[i][1].startswith("NN") and sentence[i+1][1] == "JJ"):
                # [ noun + adjective ]: ignore the trailing adjective,
                # e.g. "present amazing" yields "present"
                Bigrams.append(sentence[i][0])

            elif (sentence[i][1].startswith("NN") and sentence[i+1][0] in stopwords.words("english")):
                # [ noun + stopword ]: ignore the trailing stopword,
                # e.g. "christmas the" yields "christmas"
                Bigrams.append(sentence[i][0])

            else:
                Bigrams.append(sentence[i][0] + " " + sentence[i+1][0])

        print Bigrams

        return Bigrams
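
A hedged usage sketch for the bigram extraction above, following the behaviour described in the inline comments; the concept strings are arbitrary.

# Hedged usage sketch for Parser.FindBigrams and Parser.SyntacticMatch; illustrative only.
p = Parser()
print p.FindBigrams("a very special christmas gift")
# Per the rules above, stopword/adjective-only pairs such as "a very" are dropped,
# while "christmas", "special christmas" and "christmas gift" are kept.
print p.SyntacticMatch("christmas gift", "special christmas")
# True when the two concepts share an identically tagged noun token.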
コード例 #51
0
class NltkHelper:

	def __init__(self, text):
		reload(sys)  
		sys.setdefaultencoding('utf8')

		self.text = text

		root = os.path.dirname(os.path.realpath(__file__))
		os.environ["STANFORD_PARSER"] = root+"/stanford-postagger/stanford-postagger.jar"
		os.environ["STANFORD_MODELS"] = root+"/stanford-postagger/models/"
		
		
		_path_to_model  = root + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
		_path_to_jar    = root + '/stanford-postagger/stanford-postagger.jar'
		self.stanford   = StanfordPOSTagger(_path_to_model, _path_to_jar)

		self.sentences  = sent_tokenize(text.encode("utf-8"))
		self.words      = word_tokenize(text.encode("utf-8"))
		

		self.tags = self.stringifyTuples(self.stanford.tag( word_tokenize(text.lower()) ))
		#cleanWords = self.cleanWords()
		#self.tags = self.stringifyTuples(self.stanford.tag( cleanWords ))
		#print self.cleanWords()
		
		self.taggedBigrams = self.ngramsAndTags(2) 

		#print self.words
		#print self.cleanWords()
		
		#print "Bigrams --> ", self.taggedBigrams
		#print "Tags --> ", self.findTags()
		
		#print (nouns)
	
	def personal_names(self):
		output = []

		#(('reports', 'NNS'), ('claim', 'VBP'))
		for gram in self.taggedBigrams:
			tag1  = gram[0][1]
			tag2  = gram[1][1]
			word1 = gram[0][0]
			word2 = gram[1][0]

			if self.isPersonalName( tag1 ) and self.isPersonalName( tag2 ):
				output.append( "{0} {1}".format(word1, word2) )
		return output

	def isPersonalName(self, tag):
		return tag == "NNP" or tag == "FW"

	
	def preprocessTitle(self):
		
		output = ''
		for taggedWord in self.tags:
			
			word = taggedWord[0]
			tag  = taggedWord[1]

			if self.isPersonalName(tag):
				output = "{0} {1}".format(output, word.title())
			else:
				output = "{0} {1}".format(output, word.lower())


		return output
	
	def ngramsAndTags(self, n):
		output = []
		for i in range(len(self.tags)-n+1):
			gram = (self.tags[i],)
			for j in range(i+1, i+n):
				gram += ( self.tags[j], )
			output.append( gram )

		return output
	


	def sortFrequencies( self, ngram ):
		return sorted(ngram.items(), key = operator.itemgetter(1), reverse=True)		    
    

	
	def findTags(self):
		#pattern = [("AJ", NOUN/S/FWS), (FW, FW), NOUN, NOUN]
		output = []

		#(('reports', 'NNS'), ('claim', 'VBP'))
		for gram in self.taggedBigrams:
			tag1  = gram[0][1]
			tag2  = gram[1][1]
			word1 = gram[0][0]
			word2 = gram[1][0]

			if self.isAdj( tag1 ) and self.isNounOrForeignWord( tag2 ) or self.isNounOrForeignWord( tag1 ) and self.isNounOrForeignWord( tag2 ):
				output.append( "{0} {1}".format(word1, word2) )
		return output


	def isAdj(self, tag):
		return tag=='JJ'

	def isNounOrForeignWord(self, tag):
		nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
		return tag in nouns

	"""
	def bigramsList(self):		
		pass
	"""
	def stringifyList(self, list):
		output = []
		for tag in list:
			output.append( str(tag.encode('utf-8')) )
		
		return output

	def stringifyTuples(self, tuples):
		output = []
		for tag in tuples:
			output.append( (str(tag[0].encode('utf-8')), str(tag[1].encode('utf-8'))) )
		
		return output


	"""
	returns list of tuples of tagged words in text
	"""
	def analyze(self):
		output = []
		for sentence in self.sentences:
			taggedWords = self.stanford.tag( word_tokenize( sentence.lower() ) )
			output.extend( self.stringifyTuples(taggedWords) )

		return output

	"""
	returns list of nouns and foreign words
	"""
	def filterNounsInText(self):
		output = set()
		nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']

		for sentence in self.sentences:
			taggedWords = self.stanford.tag( word_tokenize(sentence.lower() ) )
			for item in taggedWords:
				if item[1] in nouns:
					output.add( item[0] )
		
		return self.stringifyList( list(output) )

	



	def cleanWords(self):
		input = ''
		for item in self.words:
			input = "{0} {1}".format(input, item)

		input = re.sub('\n+', " ", input)
		input = re.sub('\[[0-9]*\]', "", input)
		input = re.sub(' +', " ", input)
		input = bytes(input)
		input = input.decode('ascii', 'ignore')

		input = input.split(" ")
		cleanInput = []

		for item in input:
			item = item.strip( string.punctuation )

			if len(item)>1 or (item.lower()=='a' or item.lower()=='i'):
			    cleanInput.append( item )

		return cleanInput



	def bigramNouns(self, text):
		nouns = self.filterNouns(text)		
		
		

	def isTagNounOrForeignWord(self, word):
		output = False
		nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
		taggedWords = self.stanford.tag( [word.lower()] )
		for item in taggedWords:
			if item[1] in nouns:
				output = True
				break
		return output

	def filterNouns(self, input):
		output = set()
		nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
		sentences = sent_tokenize(input)
		for sentence in sentences:
			taggedWords = self.stanford.tag( word_tokenize(sentence.lower() ) )
			for item in taggedWords:
				if item[1] in nouns:
					output.add( item[0] )
		nList = list(output)
		return self.stringifyList( nList )

	@staticmethod
	def define( word ):

		definitions = []	
		try:
			synsets = wn.synsets(word)
			for synset in synsets:
				definitions.append (synset.definition())
		except ValueError:
			print "Cannot define '{0}'".format(word)

		return definitions

	def sentenceExamples( self, noun):
		output = []
		try:
			synsets = wn.synsets(noun)
			for synset in synsets:
				examples = synset.examples()
				for example in examples:
					output.append( example )
		except (ValueError, AttributeError):
			print "Cannot find any example for '{0}'".format(noun)

		return output
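
A hedged usage sketch for the NltkHelper class above; the input text is arbitrary and the exact tags depend on the Stanford model in use.

# Hedged usage sketch for NltkHelper; illustrative only.
helper = NltkHelper("Angela Merkel visits the new research centre in Berlin")
print helper.personal_names()       # bigrams whose tags are both NNP/FW, e.g. "angela merkel"
print helper.findTags()             # adjective+noun and noun+noun bigrams
print helper.filterNounsInText()    # nouns and foreign words found in the text
print helper.preprocessTitle()      # proper names title-cased, everything else lower-cased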
コード例 #52
0
os.environ['STANFORD_MODELS'] = \
'C:/stanford_data/stanford-parser-3.5.2-models.jar'

parser = stanford.StanfordParser(model_path= \
"C:/stanford_data/englishPCFG.ser.gz")


parsed_sentences = parser.raw_parse( \
(my_sentence))

for i in parsed_sentences:
    for k in i:
        print(k)

# GUI
for line in parsed_sentences:
    for sentence in line:
        sentence.draw()

sys.exit()

st = StanfordPOSTagger(r'C:/stanford_data/english-bidirectional-distsim.tagger',r'C:/stanford_data/stanford-postagger.jar')

bobo = st.tag(my_sentence.split())

print(bobo)

for i in bobo:
    print(i)
    
コード例 #53
0
ファイル: tagger.py プロジェクト: sureshbvn/nlpProject
from nltk.tag.stanford import StanfordPOSTagger
import nltk
import os

os.environ['CLASSPATH'] = "/home/vishesh/Downloads/stanford-postagger-full-2015-12-09/"

english_postagger = StanfordPOSTagger('models/english-bidirectional-distsim.tagger')

print english_postagger.tag(nltk.word_tokenize('this is stanford postagger in nltk for python users'))


fo = open('europarl-v7.de-en.de','r')
data = fo.read()
fo.close()

fw = open('europarl_tags_testing.txt','w')

data = data.decode('utf-8')
data = data.split('\n')

#tokens = data.split()
#print len(tokens)

#print 'Tagging...'

german_postagger = StanfordPOSTagger('/home/vishesh/Documents/NLP/postagger/models/german-fast-caseless.tagger')
for i in range(10000,11500):
	tokens = nltk.word_tokenize(data[i])
	
	tags = german_postagger.tag(tokens)
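	# Hedged sketch (not in the original excerpt): one plausible way to serialise the tagged
	# sentence into europarl_tags_testing.txt as word/tag pairs, one sentence per line.
	fw.write((' '.join(w + '/' + t for w, t in tags) + '\n').encode('utf-8'))

fw.close()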
コード例 #54
0
ファイル: test_Doc2Vec.py プロジェクト: kaminem64/itunes
from gensim import matutils
from math import log
from collections import Counter, defaultdict

from sklearn.decomposition import NMF

from nltk.tokenize import TweetTokenizer
import nltk
import re
import os
path = '/home/kaminem64/stanford'
os.environ['CLASSPATH'] = '%s/stanford-postagger-full-2015-04-20/stanford-postagger.jar:%s/stanford-ner-2015-04-20/stanford-ner.jar:%s/stanford-parser-full-2015-04-20/stanford-parser.jar:%s/stanford-parser-full-2015-04-20/stanford-parser-3.6.0-models.jar' %(path, path, path, path)
os.environ['STANFORD_MODELS'] = '%s/stanford-postagger-full-2015-04-20/models:%s/stanford-ner-2015-04-20/classifiers' %(path, path)
from nltk.tag.stanford import StanfordPOSTagger

stanford_pos_tag = StanfordPOSTagger('english-bidirectional-distsim.tagger')

import xlsxwriter
workbook = xlsxwriter.Workbook('topic_modeling.xlsx')
worksheet = workbook.add_worksheet()
row_num = 0
worksheet.write(row_num, 0, 'store_app_id')
worksheet.write(row_num, 1, 'name')
worksheet.write(row_num, 2, 'start_date')
worksheet.write(row_num, 3, 'end_date')
worksheet.write(row_num, 4, 'release_note')
worksheet.write(row_num, 5, 'topics')

app_ids = [307906541]#, 282614216, 383298204, 421254504, 509993510, ]
previous_date = None
コード例 #55
0
class Parser(object):
    modeldir = os.path.abspath(BASE_DIR + "/weiss/planner/models/")
    stopword_path = modeldir + "/english.stp"

    def __init__(self):
        self._postagger = StanfordPOSTagger(self.modeldir + '/postagger/models/english-bidirectional-distsim.tagger',
                                           self.modeldir + '/postagger/stanford-postagger.jar')
        self._stemmer = nltk.SnowballStemmer("english")
        self._stopwords = stopword(self.stopword_path)
        self._type_words = self._set_type_words()
        self._sentiment = self._get_sentiment()


    def _get_sentiment(self):
        sentiment = {}
        for line in open(self.modeldir + "/AFINN.txt"):
            word, score = line.split('\t')
            sentiment[word] = int(score)
        return sentiment

    def calculate_sentiment(self, query):
        tokens = nltk.word_tokenize(query)
        score = 0
        for token in tokens:
            if token in self._sentiment:
                score += self._sentiment[token]
        return score

    def entity_recognition(self, query, arguments):
        """Parse query and extract keywords

        This function is called in planner

        Args:
            query: query needs to be parsed
            arguments: info needs to be updated
        """
        tokens = nltk.word_tokenize(query)
        tags = self._postagger.tag(tokens)

        tuples = []

        for tag in tags:
            if tag[0] in self._stopwords:
                continue
            stemmed = self._stemmer.stem(tag[0])
            if stemmed in self._type_words['movie']:
                continue
            if stemmed in self._type_words['article']:
                continue
            if stemmed in self._type_words['restaurant']:
                continue
            if tag[1][:2] == 'NN' or tag[1][:2] == 'JJ':
                tuples.append(tag[0])

        if len(tuples) > 0:
            arguments['keywords'] = tuples
            logger.info("Here are the keywords: %s" % arguments['keywords'])

    def _set_type_words(self):
        """Initialize synonymy words of movie, article and restaurant

        This function is called during initialization

        Return: A dictionary, key: movie, article, restaurant, value: their synonymy words
        """
        topic = {}
        movie = ['cinema', 'show', 'film', 'picture', 'cinematograph',
                 'videotape', 'flick', 'pic', 'cine', 'cinematics', 'photodrama',
                 'photoplay', 'talkie', 'flicker', 'DVD', 'movie']
        article = ['report', 'announcement', 'story', 'account',
                   'newscast', 'headlines', 'press', 'communication', 'talk', 'word',
                   'communique', 'bulletin', 'message', 'dispatch', 'broadcast',
                   'statement', 'intelligence', 'disclosure', 'revelation',
                   'gossip', 'dispatch', 'news', 'article']
        restaurant = ['bar', 'cafeteria', 'diner', 'dining', 'saloon', 'coffeehouse',
                      'canteen', 'chophouse', 'drive-in', 'eatery', 'grill', 'lunchroom', 'inn', 'food',
                      'pizzeria', 'hideaway', 'cafe', 'charcuterie', 'deli', 'restaurant']
        for m in movie:
            topic.setdefault('movie', set([]))
            topic['movie'].add(self._stemmer.stem(m))
        for a in article:
            topic.setdefault('article', set([]))
            topic['article'].add(self._stemmer.stem(a))
        for r in restaurant:
            topic.setdefault('restaurant', set([]))
            topic['restaurant'].add(self._stemmer.stem(r))
        return topic


    def type_recognition(self, query, arguments):
        """Identity the type of the topic: movie, article or restaurant

        This is called in planner

        Args:
            query: query needs to be parsed
            arguments: info needs to be updated

        """
        tokens = nltk.word_tokenize(query)
        first = self._stemmer.stem(tokens[0])
        last = self._stemmer.stem(tokens[-1])
        lastsecond = self._stemmer.stem(tokens[-2]) if len(tokens) > 1 else "toy"
        if (first in self._type_words['article'] or last in self._type_words['article']
            or lastsecond in self._type_words['article']):
            arguments['tid'] = Type.News
        elif (first in self._type_words['restaurant'] or last in self._type_words['restaurant']
              or lastsecond in self._type_words['restaurant']):
            arguments['tid'] = Type.Restaurant
        elif (first in self._type_words['movie'] or last in self._type_words['movie']
              or lastsecond in self._type_words['movie']):
            arguments['tid'] = Type.Movie
        else:
            arguments['tid'] = Type.Unknown


    @staticmethod
    def _string_to_idx(number):
        if number == 'first' or number == 'one':
            return 0
        if number == 'second' or number == 'two':
            return 1
        if number == 'third' or number == 'three':
            return 2
        if number == 'fourth' or number == 'four':
            return 3
        if number == 'fifth' or number == 'five':
            return 4


    @staticmethod
    def keyword_matching(arguments, entities):
        words = arguments['keywords']
        phonics = set([])
        overlap = []

        for w in words:
            phonics.add(fuzzy.nysiis(w))

        for i in xrange(0, len(entities)):
            entity_name = nltk.word_tokenize(entities[i].name)
            entity_phonics = set([])
            for word in entity_name:
                entity_phonics.add(fuzzy.nysiis(word))
            common = len(phonics & entity_phonics) / float(len(entity_phonics))
            if common == 1:
                arguments['idx'] = i
                return
            overlap.append(common)
        arguments['idx'] = overlap.index(max(overlap))


    def find_number(self, query, arguments, entities):
        tokens = nltk.word_tokenize(query)
        tags = self._postagger.tag(tokens)
        last = query.find('last')

        # Edge case, "first" cannot be tagged correctly
        if len(query.split(" ")) <= 3 and query.find('first') != -1:
            arguments['idx'] = 0
            return 

        number = None
        for t in tags:
            if t[1] == 'JJ' and t[0][-2:] in set(['th', 'nd', 'st', 'rd']):
                number = t[0]
                break
            elif t[1] == 'CD' and t[0]:
                number = t[0]
                if number.isdigit() and int(number) < 6:
                    arguments['idx'] = int(number) - 1
                    return
                break

        if number is not None:
            if last == -1:
                arguments['idx'] = self._string_to_idx(number)
            else:
                arguments['idx'] = len(entities) - self._string_to_idx(number) - 1
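
A hedged usage sketch for the planner Parser above; the queries are arbitrary and Type comes from the enum the class already assumes.

# Hedged usage sketch; illustrative only.
parser = Parser()
arguments = {}

parser.type_recognition("find me a cheap sushi restaurant", arguments)    # fills arguments['tid']
parser.entity_recognition("find me a cheap sushi restaurant", arguments)  # fills arguments['keywords']
parser.find_number("show me the third one", arguments, [])                # fills arguments['idx']

print arguments
print parser.calculate_sentiment("this place is great but the service is terrible")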
コード例 #56
0
class NLTKHelper(object):
    """docstring for NLTKHelper"""
    def __init__(self, text):
        reload(sys)
        sys.setdefaultencoding('utf8')

        self.text = text

        root = os.path.dirname(os.path.realpath(__file__))
        os.environ["STANFORD_PARSER"] = root + ''   # jar path relative to root (not given in this excerpt)
        os.environ["STANFORD_MODELS"] = root + ''   # models path relative to root (not given in this excerpt)
        _path_to_model = root + ''
        _path_to_jar = root + ''
        self.stanford = StanfordPOSTagger(_path_to_model, _path_to_jar)
        self.sentences = sent_tokenize(text.encode("utf-8"))
        self.words = word_tokenize(text.encode("utf-8"))

        self.tags = self.stringifyTuples(self.stanford.tag(word_tokenize(text.lower())))
        #cleanWords

        self.taggedBigrams = self.ngramsAndTags(2)
        #print self.words

    def personal_names(self):
        output = []

        for gram in self.taggedBigrams:
            tag1  = gram[0][1]
            tag2  = gram[1][1]
            word1 = gram[0][0]
            word2 = gram[1][0]

            if  self.isPersonalName(tag1) and self.isPersonalName(tag2):
                output.append("{0} {1}".format(word1, word2))
        
        return output

    def isPersonalName(self, tag):
        return tag == "NNP" or tag == "FW"

    def preprocessTitle(self):

        output = ''
        for taggedWord in self.tags:
            word = taggedWord[0]
            tag  = taggedWord[1]

            if self.isPersonalName(tag):
                output = "{0} {1}".format(output, word.title())
            else:
                output = "{0} {1}".format(output, word.lower())

        return output

    def ngramsAndTags(self, n):
        output = []
        for i in range(len(self.tags)-n+1):
            gram = (self.tags[i],)
            for j in range(i+1, i+n):
                gram +=(self.tags[j], )
            output.append(gram)
        return output

    def sortFrequencies(self, ngram):
        return sorted(ngram.items(), key = operator.itemgetter(1), reverse=True)

    def findTags(self):
        output = []

        for gram in self.taggedBigrams:
            tag1 = gram[0][1]
            tag2 = gram[1][1]
            word1 = gram[0][0]
            word2 = gram[1][0]

            if self.isAdj( tag1 ) and self.isNounOrForeignWord( tag2 ) or self.isNounOrForeignWord( tag1 ) and self.isNounOrForeignWord( tag2 ):
                output.append( "{0} {1}".format(word1, word2) )
        return output

    def isAdj(self, tag):
        return tag=='JJ'

    def isNounOrForeignWord(self, tag):
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        return tag in nouns

    def stringifyList(self, list):
        output = []
        for tag in list:
            output.append( str(tag.encode('utf-8')) )
        
        return output

    def stringifyTuples(self, tuples):
        output = []
        for tag in tuples:
            output.append( (str(tag[0].encode('utf-8')), str(tag[1].encode('utf-8'))) )
        
        return output

    #returns list of tuples of tagged words in text
    def analyze(self):
        output = []
        for sentence in self.sentences:
            taggedWords = self.stanford.tag(word_tokenize(sentence.lower()))
            output.extend(self.stringifyTuples(taggedWords))

        return output

    def filterNounsInText(self):
        output = set()nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']

        for sentence in self.sentences:
            taggedWords = self.stanford.tag( word_tokenize(sentence.lower() ) )
            for item in taggedWords:
                if item[1] in nouns:
                    output.add( item[0] )
        
        return self.stringifyList( list(output) )

    def cleanWords(self):
        input = ''
        for item in self.words:
            input = "{0} {1}".format(input, item)

        input = re.sub('\n+', " ", input)
        input = re.sub('\[[0-9]*\]', "", input)
        input = re.sub(' +', " ", input)
        input = bytes(input)
        input = input.decode('ascii', 'ignore')

        input = input.split(" ")
        cleanInput = []

        for item in input:
            item = item.strip( string.punctuation )

            if len(item)>1 or (item.lower()=='a' or item.lower()=='i'):
                cleanInput.append( item )

        return cleanInput

    def bigramNouns(self, text):
        nouns = self.filterNouns(text)

    def isTagNounOrForeignWord(self, word):
        output = False
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        taggedWords = self.stanford.tag( [word.lower()] )
        for item in taggedWords:
            if item[1] in nouns:
                output = True
                break
        return output

    def filterNouns(self, input):
        output = set()
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        sentences = sent_tokenize(input)
        for sentence in sentences:
            taggedWords = self.stanford.tag( word_tokenize(sentence.lower() ) )
            for item in taggedWords:
                if item[1] in nouns:
                    output.add( item[0] )
        nList = list(output)
        return self.stringifyList(nList)

    @staticmethod
    def define(word):

        definitions = []
        try:
            synsets = wn.synsets(word)
            for synset in synsets:
                definitions.append(synset.definition())
        except ValueError:
            print "Cannot define '{0}'".format(word)

        return definitions
コード例 #57
0
ファイル: stanford_pos.py プロジェクト: sara-02/nltk_study
# -*- coding: utf-8 -*-
import nltk
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import word_tokenize

#the path where you have downloaded and unziped the full parser.
sp_dir = '/home/sarah/postagger/'
english_model = sp_dir + 'models/english-bidirectional-distsim.tagger'
chinese_model = sp_dir + 'models/chinese-distsim.tagger'
jar_path = sp_dir + 'stanford-postagger.jar'

#testing the english POS tagger
print "For the English model"
st_eng = StanfordPOSTagger(model_filename = english_model, path_to_jar = jar_path)
eng_sent = 'This is Stanford postagger in nltk for Python users.'
print eng_sent
eng_tokens = word_tokenize(eng_sent)
eng_tagged = st_eng.tag(eng_tokens)
for i in eng_tagged:
	print i

#testing for the chinese POS tagger
print "\n\nFor the Chinese model"
st_chi = StanfordPOSTagger(model_filename = chinese_model, path_to_jar = jar_path,encoding = 'utf-8')
chi_sent = '这 是 在 Python 环境 中 使用 斯坦福 词性 标 器'
print chi_sent
chi_tokens = word_tokenize(chi_sent)
chi_tagged = st_chi.tag(chi_tokens)
for i in chi_tagged:
	print i
#print st_chi.tag('这 是 在 Python 环境 中 使用 斯坦福 词性 标 器'.split())
コード例 #58
0
from nltk import pos_tag,word_tokenize
#from Utils import getQues
#txt=getQues()
#txt="benim adim yahya"
from nltk.tag.stanford import StanfordPOSTagger
txt="i am dentist"
tgr=StanfordPOSTagger('models/english-bidirectional-distsim.tagger','stanford-postagger.jar')
print tgr.tag(word_tokenize(txt))