def pos_tag(sent):
	sent_pos = nltk.pos_tag(nltk.wordpunct_tokenize(sent.lower()))
	simplified = []
	for w, pos in sent_pos:
		simplified_tag = simplify_wsj_tag(pos)
		if simplified_tag:
			simplified.append( (w, simplified_tag) )
		else:
			simplified.append( (w, 'U') )
	return simplified
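A minimal usage sketch for the function above, assuming NLTK 2.x (where nltk.tag.simplify.simplify_wsj_tag still exists); the sample sentence is illustrative only:

import nltk
from nltk.tag.simplify import simplify_wsj_tag

print(pos_tag("The burgers are very good."))
# prints a list of (token, simplified-tag) pairs, with 'U' for any tag
# that the simplified WSJ mapping leaves empty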
def otkloni_nepozeljne(all_words, koliko):
    tagged_sent = nltk.pos_tag(all_words)
    simplified = [(word, simplify_wsj_tag(tag)) for word, tag in tagged_sent]
    useful_words = [
        t[0] for t in simplified if t[0] != '"' and (ok(t[1]) or t[0] == '--')
    ]
    return useful_words[:koliko]
Example no. 3
def pos_tag(text, simple=False):
    """ Tokenizes a given text and determines the pos-tags. Lowercases
        the text.

     Params:
        text: string to be tokenized
        simple: boolean indicating whether to simplify the pos tags

    Returns:
        list of tuples of form (token, pos-tag)
    """

    blob = TextBlob(text.lower())
    pos = blob.tags

    # simplify tags if requested
    if simple:
        simple_pos = []
        for word, tag in pos:
            new_tag = simplify_wsj_tag(tag)
            # simplification maps some tags to an empty string;
            # an empty tag is not allowed, so keep the original one
            if not new_tag:
                new_tag = tag
            simple_pos.append((word, new_tag))
        pos = simple_pos

    return pos
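A hedged usage sketch for the TextBlob-based variant above; it assumes TextBlob is installed alongside an NLTK version that still ships simplify_wsj_tag:

from textblob import TextBlob
from nltk.tag.simplify import simplify_wsj_tag

tagged = pos_tag("The service is bad.", simple=True)
# each entry is (token, tag); with simple=True the tag is the simplified
# WSJ tag, falling back to the original Penn tag when simplification is empty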
Example no. 4
def simplify_chunk(chunk):
    if isinstance(chunk, Tree):
        return Tree(chunk.node, [simplify_chunk(c) for c in chunk])
    elif isinstance(chunk, tuple):
        word, tag = chunk
        return (word, simplify_wsj_tag(tag))
    else:
        return chunk
Example no. 5
def simplify_chunk(chunk):
	if isinstance(chunk, Tree):
		return Tree(chunk.node, [simplify_chunk(c) for c in chunk])
	elif isinstance(chunk, tuple):
		word, tag = chunk
		return (word, simplify_wsj_tag(tag))
	else:
		return chunk
def pos_tokens(essay):
	#converts an essay into a bag of parts of speech
	text = nltk.word_tokenize(essay)
	tagged_text = nltk.pos_tag(text)
	simplified = [(word, simplify_wsj_tag(tag)) for word, tag in tagged_text]
	pos_tags_only = ""  # accumulate the POS tags as a space-separated string
	for word, tag in simplified:
		pos_tags_only = pos_tags_only + " " + tag
	return pos_tags_only
Example no. 7
def average_pos_entropy_storeys(model, filename):
  with open(filename, 'r') as f:
    entropies = []
    for l in f:
      word_list = nltk.word_tokenize(l)
      if len(word_list) > 1:
        simplified = [simplify_wsj_tag(t) for _, t in nltk.pos_tag(word_list)]
        entropies.append(model.entropy(simplified))
  return ((sum(entropies) - 0.0) / len(entropies), entropies)
Example no. 8
def filter_words(tokens): #filter nouns and verbs
    t = nltk.pos_tag(tokens)
    filt = []
    for word, tag in t:
        simptag = simplify_wsj_tag(tag)
        if simptag == 'N':
            filt.append([word,'n'])
        elif simptag == 'V':
            filt.append([word,'v'])
    return filt
Example no. 9
def average_pos_entropy(model, filename):
  '''Get the average entropy of sentences in this file'''
  with open(filename) as f:
    entropies = []
    lines = f.read()
    for l in nltk.sent_tokenize(lines):
     word_list = nltk.word_tokenize(l)
     if len(word_list) > 1:
      simplified = [simplify_wsj_tag(t) for _, t in nltk.pos_tag(word_list)]
      entropies.append(model.entropy(simplified))
  return ((sum(entropies) - 0.0) / len(entropies), entropies)
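A brief call sketch; the model argument is assumed to be any object exposing an entropy(sequence) method (for instance an n-gram model trained on simplified tag sequences), and 'essays.txt' is a hypothetical input file:

class UniformTagModel(object):
    # hypothetical stand-in: any object with an entropy(sequence) method works
    def entropy(self, tags):
        return float(len(set(tags)))

avg_entropy, per_sentence = average_pos_entropy(UniformTagModel(), 'essays.txt')
print(avg_entropy)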
 def simplify_tag(tag):
     # Keep POS tag as it is useful: many terms contain possessive: "'s"
     if tag.lower() == 'pos':
         return 'POS'
     else:
         simple_wsj_tag = simplify_wsj_tag(tag)
     if not simple_wsj_tag:
         # Convert '' tags to 'EMPTY' as RegexpParser doesn't like the
         # empty wsj simplified tag.
         return 'EMPTY'
     else:
         return simple_wsj_tag
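The 'EMPTY' fallback above exists because, as the comment notes, RegexpParser does not accept the empty simplified tag. A hedged sketch of feeding the cleaned-up tags into a chunker (the grammar is illustrative, not taken from the source project):

import nltk
from nltk.tag.simplify import simplify_wsj_tag

grammar = r"NP: {<DET>?<ADJ>*<N|NP>+<POS>?<N|NP>*}"
chunker = nltk.RegexpParser(grammar)
tokens = nltk.word_tokenize("the dog's loud bark")
tagged = [(word, simplify_tag(tag)) for word, tag in nltk.pos_tag(tokens)]
tree = chunker.parse(tagged)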
Example no. 12
def removeStopwords(sentence):
	'''Lemmatize the sentence and keep only noun/verb tokens'''
	ret = []
	orig = []
	temp = nltk.word_tokenize(sentence)
	temp = nltk.pos_tag(temp)
	stmr = WordNetLemmatizer()
	temp = [(word, simplify_wsj_tag(tag)) for word, tag in temp]
	sen = [ stmr.lemmatize(x.lower(),tag[0].lower()) for x,tag in temp if tag in ['N','NP','NUM','V','VD','VG','VN']]
	
	#sen = [ stmr.lemmatize(word.lower(),'v') for word in re.sub("[^\w]"," ",sentence).split() if word.lower() not in stopwords.words('english') ]
	return sen
Example no. 13
def prepare_input(sentence):
    words = []
    sentences = nltk.sent_tokenize(sentence)
    for sent in sentences:
        words = words + nltk.word_tokenize(sent)
    pos = nltk.pos_tag(words)
    pos = [simplify_wsj_tag(tag) for word, tag in pos]
    words = [w.lower() for w in words]
    trigrams = nltk.trigrams(words)
    trigrams = ['%s/%s/%s' % (i[0], i[1], i[2]) for i in trigrams]
    features = words + pos + trigrams
    features = dict((f, True) for f in features)
    return features
Example no. 14
def prepare_input(sentence):
    words = []
    sentences = nltk.sent_tokenize(sentence)
    for sent in sentences:
        words = words + nltk.word_tokenize(sent)
    pos = nltk.pos_tag(words)
    pos = [simplify_wsj_tag(tag) for word, tag in pos]
    words = [w.lower() for w in words]
    trigrams = nltk.trigrams(words)
    trigrams = ['%s/%s/%s' % (i[0], i[1], i[2]) for i in trigrams]
    features = words + pos + trigrams
    features = dict((f, True) for f in features)
    return features
Example no. 15
def removeStopwords(sentence):
	'''Remove Stop words and stem the sentence. It also splits the sentences into words before stemming. '''
	# TODO([email protected]): add part of speech to each word thus produced
	ret = []
	orig = []
	temp = nltk.word_tokenize(sentence)
	temp = nltk.pos_tag(temp)
	stmr = WordNetLemmatizer()
	temp = [(word, simplify_wsj_tag(tag)) for word, tag in temp]
	#sen = [ stmr.lemmatize(x.lower(),'n') for x,tag in temp if tag in ['N','NP','NUM']]
	
	sen = [ stmr.lemmatize(word.lower(),'v') for word in re.sub("[^\w]"," ",sentence).split() if word.lower() not in stopwords.words('english') ]
	return sen
Example no. 16
    def process_tips(tips):

        tags = set()
        index = 0
        for tip in tips:
            tagged_tip = TipPosTagger.tag_text(tip)
            simplified = [(word, simplify_wsj_tag(tag)) for word, tag in
                          tagged_tip]
            for tagged_word in simplified:
                tags.add(tagged_word[1])
            index += 1
            print(index)

        print(tags)
Example no. 17
    def process_tips(tips):

        tags = set()
        index = 0
        for tip in tips:
            tagged_tip = TipPosTagger.tag_text(tip)
            simplified = [(word, simplify_wsj_tag(tag))
                          for word, tag in tagged_tip]
            for tagged_word in simplified:
                tags.add(tagged_word[1])
            index += 1
            print(index)

        print(tags)
Example no. 18
def removeStopwords(sentence):
    '''Lemmatize the sentence and keep only noun/verb tokens'''
    ret = []
    orig = []
    temp = nltk.word_tokenize(sentence)
    temp = nltk.pos_tag(temp)
    stmr = WordNetLemmatizer()
    temp = [(word, simplify_wsj_tag(tag)) for word, tag in temp]
    sen = [
        stmr.lemmatize(x.lower(), tag[0].lower()) for x, tag in temp
        if tag in ['N', 'NP', 'NUM', 'V', 'VD', 'VG', 'VN']
    ]

    #sen = [ stmr.lemmatize(word.lower(),'v') for word in re.sub("[^\w]"," ",sentence).split() if word.lower() not in stopwords.words('english') ]
    return sen
Example no. 19
File: wsd.py Project: fa97/cs4740
def read_file(file_object):
    lines = file_object.readlines()
    for line in lines:
        print "#######LINE#######"
        print line

        text = PunktWordTokenizer().tokenize(line)
        #text = nltk.wordpunct_tokenize(line)
        print "#######TEXT#######"
        print text

        """
        STOP WORD
        """
        stopwords = nltk.corpus.stopwords.words('english')
        content = [w for w in text if w.lower() not in stopwords]  # compare the whole token, not just its first character
        print "#######STOP WORD#######"
        print content

        """
        POS TAGGING
        """
        tagged_sent = nltk.pos_tag(content)
        tagged_sent = [(word, simplify_wsj_tag(tag)) for word, tag in tagged_sent]
        print "#######POS#######"
        print tagged_sent

        """
        STEMMING
        """
        #tagged_sent = tuple(tagged_sent)
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stem_word = ""
        for wrd in tagged_sent:
            stem_word = stem_word + " " + stemmer.stem(wrd[0])
        print "#######STEMMING#######"
        print stem_word
        """
        LEMMATIZING
        """
        print tagged_sent
        lmtzr = WordNetLemmatizer()
        sent = ""
        for wrd in tagged_sent:
            sent = sent + " " + lmtzr.lemmatize(wrd[0])
        print "#######LEMMA"""""""
        print sent
Example no. 20
def get_pos_dict(z):
    print(z[0])
    text = z[1]
    pos = []
    posd = defaultdict(int)
    # seems slower, despite the promising name
    # pos = batch_pos_tag(text)
    # pos = [tag for sent in batch_pos_tag(map(word_tokenize,
    #                                          sent_tokenize(text.strip())))
    #        for tag in sent]
    for s in sent_tokenize(text):
        wk = word_tokenize(s)
        pos += pos_tag(wk)

    for p in pos:
        posd[simplify_wsj_tag(p[1])] += 1
    return posd
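A short usage sketch; z is assumed to be an (id, text) pair, and the snippet relies on the same imports the excerpt does (defaultdict, the NLTK tokenizers and tagger, and simplify_wsj_tag):

counts = get_pos_dict((0, "The burgers are very good. The service is bad."))
# counts is a defaultdict mapping each simplified tag to its frequency,
# e.g. counts['N'] would hold the number of noun tokens across both sentences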
Example no. 21
def read_file(file_object):
    lines = file_object.readlines()
    for line in lines:
        print "#######LINE#######"
        print line

        text = PunktWordTokenizer().tokenize(line)
        #text = nltk.wordpunct_tokenize(line)
        print "#######TEXT#######"
        print text
        """
        STOP WORD
        """
        stopwords = nltk.corpus.stopwords.words('english')
        content = [w for w in text if w.lower() not in stopwords]  # compare the whole token, not just its first character
        print "#######STOP WORD#######"
        print content
        """
        POS TAGGING
        """
        tagged_sent = nltk.pos_tag(content)
        tagged_sent = [(word, simplify_wsj_tag(tag))
                       for word, tag in tagged_sent]
        print "#######POS#######"
        print tagged_sent
        """
        STEMMING
        """
        #tagged_sent = tuple(tagged_sent)
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stem_word = ""
        for wrd in tagged_sent:
            stem_word = stem_word + " " + stemmer.stem(wrd[0])
        print "#######STEMMING#######"
        print stem_word
        """
        LEMMATIZING
        """
        print tagged_sent
        lmtzr = WordNetLemmatizer()
        sent = ""
        for wrd in tagged_sent:
            sent = sent + " " + lmtzr.lemmatize(wrd[0])
        print "#######LEMMA" """"""
        print sent
Example no. 22
File: wsd.py Project: fa97/cs4740
def process_word_context(entire_context):
    #remove punct
    cont_without_punct = entire_context.translate(string.maketrans('', ''), r'!"#$&\'()*+,-./:;<=>?@[\\]^_`{}~')
    cont_without_punct = " ".join(cont_without_punct.split())
    #pos tagging
    tagged_sent = [(word, simplify_wsj_tag(tag)) for word, tag in nltk.pos_tag(cont_without_punct.split(" "))]

    #stop words removal
    stopwords = nltk.corpus.stopwords.words('english')
    pos_tag_without_stopwords = [wrd for wrd in tagged_sent if wrd[0].lower() not in stopwords]

    for i, pos_tag_tuple in enumerate(pos_tag_without_stopwords):
        if pos_tag_tuple[0] == '%%':
            prev_context, target_word, next_context = pos_tag_without_stopwords[:i], pos_tag_without_stopwords[i + 1], \
                                                      pos_tag_without_stopwords[i + 3:]
            break

    return prev_context, next_context
Example no. 23
 def analyzePosAndSyllables(self, text):
     tags = nltk.pos_tag(nltk.word_tokenize(text))
     tags = [(word, simplify_wsj_tag(tag)) for word, tag in tags]
     adjadvtag = ['ADJ', 'ADV']
     verbtag = ['V', 'VD', 'VG', 'VN'] # leaving out modal verbs MOD
     nountag = ['N', 'NP'] # leaving out pronouns PRO
     for p in tags:
         syl = self.nsyl(p[0])[0]
         if p[1] in adjadvtag:
             self.nadjadv = self.nadjadv + 1
         if p[1] in verbtag:
             self.nverbs = self.nverbs + 1
         if p[1] in nountag:
             self.nnouns = self.nnouns + 1
         if syl > 0:
             self.nsyllables = self.nsyllables + 1
         if syl >= 3:
             self.npolysyllables = self.npolysyllables + 1
Example no. 24
    def __get_features(self, text):
        """
        Given a string, tokenize, tag, and return a normalized set of features.

        Returns { feature: <True>, ... }
        """
        words = []
        sentences = nltk.sent_tokenize(text)
        for sentence in sentences:
            words = words + nltk.word_tokenize(sentence)
        # tag and featurize once all sentences have been tokenized
        pos = nltk.pos_tag(words)
        # TODO verify simplify_wsj_tag increases accuracy
        pos = [simplify_wsj_tag(tag) for word, tag in pos]
        words = [i.lower() for i in words]
        trigrams = nltk.trigrams(words)
        trigrams = ["%s/%s/%s" % (i[0], i[1], i[2]) for i in trigrams]
        features = words + pos + trigrams
        features = dict([(i, True) for i in features])
        return features
Example no. 25
def make_wordlist():
    f = open(dict_location, 'r')
    dictionary = f.read()
    f.close()
    # Shortening the dictionary, starting around the start of the
    # lowercase letters in my unix dictionary. Otherwise the tokenize
    # takes too long. 
    dictwords = [w for w in nltk.word_tokenize(dictionary) if w != "'s" and w != "n't"]
    dictwords = dictwords[18300:]
    short_dict = []
    for i in xrange(len(dictwords)/10):
        short_dict.append(dictwords[10*i])
    words = []
    words = nltk.pos_tag(short_dict)
    
    # simplifying the part of speech of the words following advice at:
    # http://stackoverflow.com/a/5793083 here:
    simple_words = [(word, simplify_wsj_tag(tag)) for word, tag in words]
    return simple_words
 def preprocess(self, text=''):
     '''
     Given some text, return a list of words that correspond to the desired parts of speech.
     Only nouns, proper nouns, and verbs are considered
     (present tense, past tense, present participle, past participle).
     '''
     
     #return only the words in the line
     tokenized_words = nltk.word_tokenize(text.lower())
     #assign part of speech to each word in text
     pos_tagged_words = nltk.pos_tag(tokenized_words)
     #use simplified tagging for less parts of speech
     simplified_tagged_text = [(word, simplify_wsj_tag(tag)) for word, tag in pos_tagged_words]
 
     keywords = []
     for (word,part_of_speech) in simplified_tagged_text:    
         whitelist = ['N','NP', 'V', 'VD', 'VG', 'VN']
         if part_of_speech in whitelist:
             keywords.append(word)
     return keywords
Example no. 27
    def preprocess(self, text='',method=0):
        '''
        Given some text, return a list of words/tokens, after removing punctuations,
        stop words, and digits. There are various methods that may be used, depending on the
        method flag, including part of speech tagging, and checking against various stop word
        lists
        '''

        #remove punctuation and digits
        out = text.translate(string.maketrans("",""), string.punctuation + string.digits)
        #return word tokens in the line
        tokenized_words = nltk.word_tokenize(out.lower())

        if method == 0:
            #indexes stopdict by first char of word if first char = letter
            keywords = [word for word in tokenized_words 
                        if ord(word[0]) in xrange(97,123) and word not in self.stopdict[word[0]]]

        elif method == 1:
            #use NLTK stopword list
            keywords = [word for word in tokenized_words
                        if word not in stopwords.words('english')]

        elif method == 2:
            #only keep words that correspond to desired parts of speech.
            #only want to consider: nouns, proper nouns, and verbs
            #(present tense, past tense, present participle, past participle)

            tokenized_words = nltk.word_tokenize(text.lower())
            #assign part of speech to each word in text
            pos_tagged_words = nltk.pos_tag(tokenized_words)
            #use simplified tagging for less parts of speech
            simplified_tagged_text = [(word, simplify_wsj_tag(tag)) for word, tag in pos_tagged_words]
        
            keywords = []
            for (word,part_of_speech) in simplified_tagged_text:    
                whitelist = ['N','NP', 'V', 'VD', 'VG', 'VN']
                if part_of_speech in whitelist:
                    keywords.append(word)
                
        return keywords
Example no. 28
def preProcess(tweet, dicoSlang):

    preProcessedTokens = []
    tokens = tweet.split(" ")
    for token in tokens:
        if len(token) > 0:
            if token[0] != "@":
                if token[0] == "#":
                    token = token.replace("#", "")
                if token in dicoSlang:
                    newTokens = dicoSlang[token]
                    newTokensBis = newTokens.split(" ")
                    for newToken in newTokensBis:
                        preProcessedTokens.append(newToken)
                else:
                    preProcessedTokens.append(token)

    taggedData = nltk.pos_tag(preProcessedTokens)
    simplifiedData = [(word, simplify_wsj_tag(tag)) for word, tag in taggedData]

    return simplifiedData
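A hedged usage sketch for preProcess above; dicoSlang is assumed to map slang tokens to their expansions, and the tweet text is illustrative:

dico_slang = {'gr8': 'great', 'u': 'you'}
print(preProcess("@bob the #burgers r gr8", dico_slang))
# mentions are dropped, '#' is stripped, slang is expanded, and each remaining
# token comes back as a (word, simplified-tag) pair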
Example no. 29
 def stem_sentences(self, content):
     stemmed_dict = dict()
     stemmed_text_dict = []
     sentences = self.split_content_to_sentences(content)
     for sentence in sentences:
         tokenized_sentence = self.format_sentence(sentence)
         tagged_sent = nltk.pos_tag(tokenized_sentence)
         relevant_words_in_sentence = []
         # Use built-in simplified tags.
         simplified = [(word, simplify_wsj_tag(tag)) for word, tag in tagged_sent]
         for toople in simplified:
             if toople[1]  in ['V', 'VD', 'VG', 'VN', 'ADJ', 'NP', 'N']:
                 relevant_words_in_sentence.append(toople[0])
         # Get the stems of each sentence
         wnl = nltk.WordNetLemmatizer()
         stemmed_sent = [wnl.lemmatize(word) for word in relevant_words_in_sentence]
         stemmed_dict[sentence] = stemmed_sent
         stemmed_text_dict.append(stemmed_dict)
         relevant_words_in_sentence = []
         stemmed_dict = dict()
     return stemmed_text_dict
Example no. 30
    def preprocess(self, text=''):
        '''
        Given some text, return a list of words that correspond to the desired parts of speech.
        Only nouns, proper nouns, and verbs are considered
        (present tense, past tense, present participle, past participle).
        '''

        #return only the words in the line
        tokenized_words = nltk.word_tokenize(text.lower())
        #assign part of speech to each word in text
        pos_tagged_words = nltk.pos_tag(tokenized_words)
        #use simplified tagging for less parts of speech
        simplified_tagged_text = [(word, simplify_wsj_tag(tag))
                                  for word, tag in pos_tagged_words]

        keywords = []
        for (word, part_of_speech) in simplified_tagged_text:
            whitelist = ['N', 'NP', 'V', 'VD', 'VG', 'VN']
            if part_of_speech in whitelist:
                keywords.append(word)
        return keywords
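A hedged usage sketch for the preprocess method above; the enclosing class is not shown in this excerpt, so extractor below stands in for an instance of it:

# extractor is assumed to be an instance of the (unshown) class defining preprocess
keywords = extractor.preprocess("The burgers were grilled by the chef.")
# keeps only tokens whose simplified tag is in the noun/verb whitelist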
Example no. 31
def getFreqDistOfUsefulWords(sentences):
	wordFreq = {}
	
	for sent in sentences:
		sanitized_sent = ''.join(e for e in sent if e.isalnum() or e == ' ')
		tokens = nltk.word_tokenize(sanitized_sent)
		tagged_tokens = nltk.pos_tag(tokens)
		simplified = [(word, simplify_wsj_tag(tag)) for word, tag in tagged_tokens]
		words_req = [key for key, val in simplified if val not in noneed]

		for word in words_req:
			if word in wordFreq:
				wordFreq[word] += 1
			else:
				wordFreq[word] = 1

	wordFreqOD = OrderedDict(sorted(wordFreq.items(), key=lambda t: t[1], reverse=True))

	for key, value in wordFreqOD.items():
		print key + " : " + str(value)

	return wordFreqOD
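getFreqDistOfUsefulWords relies on a module-level noneed collection that this excerpt does not define; a purely illustrative stand-in would list the simplified tags to discard, for example:

noneed = ['DET', 'P', 'PRO', 'CNJ', 'TO', 'MOD', 'WH']  # hypothetical stop-tag list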
Example no. 32
def process_word_context(entire_context):
    #remove punct
    cont_without_punct = entire_context.translate(
        string.maketrans('', ''), r'!"#$&\'()*+,-./:;<=>?@[\\]^_`{}~')
    cont_without_punct = " ".join(cont_without_punct.split())
    #pos tagging
    tagged_sent = [(word, simplify_wsj_tag(tag))
                   for word, tag in nltk.pos_tag(cont_without_punct.split(" "))
                   ]

    #stop words removal
    stopwords = nltk.corpus.stopwords.words('english')
    pos_tag_without_stopwords = [
        wrd for wrd in tagged_sent if wrd[0].lower() not in stopwords
    ]

    for i, pos_tag_tuple in enumerate(pos_tag_without_stopwords):
        if pos_tag_tuple[0] == '%%':
            prev_context, target_word, next_context = pos_tag_without_stopwords[:i], pos_tag_without_stopwords[i + 1], \
                                                      pos_tag_without_stopwords[i + 3:]
            break

    return prev_context, next_context
Example no. 33
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['-sC66z4SO3tR7nFCjfQwuQ'])
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['QnAzW6KMSciUcuJ20oI3Bw'])
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['uKSX1n1RoAzGq4bV8GPHVg'])
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['YKOvlBNkF4KpUP9q7x862w'])
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['aRkYtXfmEKYG-eTDf_qUsw'])
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['pwpl-rxwNRQdgqFz_-qMPg'])
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['3oZcTGb_oDHGwZFiP-7kxQ'])

my_tips = [my_record['text'] for my_record in business_records]
# TipPosTagger.process_tips(my_tips[:1000])


my_text = "The burgers are very good. The service is bad." + \
          "It is a great place to go with friends. I went there with my wife."
my_tags = TipPosTagger.tag_text(my_text)
simp = [(my_word, simplify_wsj_tag(my_tag)) for my_word, my_tag in my_tags]
print(simp)

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
my_sentences = tokenizer.tokenize(my_text)

# print sent_words

tip_pos_tagger = TipPosTagger()
tip_pos_tagger.analyze_tips(my_tips)

sorted_x = sorted(tip_pos_tagger.noun_dictionary.iteritems(),
                  key=operator.itemgetter(1))

print(sorted_x[:10])
print(sorted_x[-10:])
Example no. 34
def pos_tag_sentence(sent, simplify_tags=False):
    tagged = pos_tag(sent)
    if simplify_tags:
        tagged = [(word, simplify_wsj_tag(tag)) for word, tag in tagged]
    return tagged
Example no. 35
def __part_of_speech__(word):
    print "Doing POS lookup for", word
    tagged_sent = nltk.pos_tag([word])
    return simplify_wsj_tag(tagged_sent[0][1])
Example no. 36
def tagging(x):
        tokens = get_tokens(x)
        tagged = nltk.pos_tag(tokens)
        simple = ["/".join([word,simplify_wsj_tag(tag)]) for word,tag in tagged]
        y = " ".join(simple)
        return y
Example no. 37
	tagged_sents = tagged_corpus.tagged_posts(**kwargs)
else:
	if isinstance(tagged_corpus, IndianCorpusReader) and not fileids:
		fileids = 'hindi.pos'
	
	if fileids and fileids in tagged_corpus.fileids():
		kwargs['fileids'] = [fileids]
	
		if args.trace:
			print 'using tagged sentences from %s' % fileids
	
	tagged_sents = tagged_corpus.tagged_sents(**kwargs)

# manual simplification is needed for these corpora
if args.simplify_tags and args.corpus in ['conll2000', 'switchboard']:
	tagged_sents = [[(word, simplify_wsj_tag(tag)) for (word, tag) in sent] for sent in tagged_sents]

##################
## tagged sents ##
##################

# can't trust corpus to provide valid list of sents (indian)
tagged_sents = [sent for sent in tagged_sents if sent]
nsents = len(tagged_sents)

if args.fraction == 1.0:
	train_sents = test_sents = tagged_sents
else:
	cutoff = int(math.ceil(nsents * args.fraction))
	train_sents = tagged_sents[:cutoff]
	test_sents = tagged_sents[cutoff:]
wc = 0
tag_counts = FreqDist()
taglen = 7
word_set = set()

if args.simplify_tags and args.corpus not in ['conll2000', 'switchboard']:
    kwargs = {'simplify_tags': True}
else:
    kwargs = {}

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
    if len(tag) > taglen:
        taglen = len(tag)

    if args.corpus in ['conll2000', 'switchboard'] and args.simplify_tags:
        tag = simplify_wsj_tag(tag)

    wc += 1
    # loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
    if not isinstance(tag, basestring): tag = str(tag)
    tag_counts.inc(tag)
    word_set.add(word)

############
## output ##
############

print '%d total words\n%d unique words\n%d tags\n' % (wc, len(word_set),
                                                      len(tag_counts))

if args.sort == 'tag':
Example no. 39
def tag(tokens):
    tagged_sent = nltk.pos_tag(tokens)
    simplified = [simplify_wsj_tag(tag) for word, tag in tagged_sent]
    return simplified
Example no. 40
	def tag_pos(self,tweet):
		entity = {}					#entity for the current tweet
		topic = self.search_for_location_ref_tweet	#search for location references by regex match
		
		#weigh the hashtags, WEIGHTAGE = 6
		hashtags = self.hashtag.findall(tweet)
		for tag in hashtags:
			entity[tag] = 6 if tag not in entity else entity[tag] + 6
		
		#weigh the discounts, WEIGHTAGE = 1
		discounts = self.discount.findall(tweet)
		for discount in discounts:
			entity[discount] = 1 if discount not in entity else entity[discount] + 1
		
		#replace placeholders for tweets
		tweet = self.sub_placeholders(tweet)

		
		text = nltk.wordpunct_tokenize(tweet)
		tokens = nltk.pos_tag(text)
		simplified_tokens = [(word, simplify_wsj_tag(tag)) for word, tag in tokens]
		topic = ""
		
		for i in range(0,len(simplified_tokens)):
			#backward lookup on the basis of certain keywords
			if simplified_tokens[i][0].lower() == 'sale' or simplified_tokens[i][0].lower() == 'sales' or simplified_tokens[i][0].lower() == 'deal' or simplified_tokens[i][0].lower() == 'deals':
				j = i-1
				found = False
				while (j >= 0 and not(found)):
					#look for nouns. Once a noun is found, scan for all the immediately preceding nouns, else stop. These words get a weightage of 4 (higher)
					if (simplified_tokens[j][1]=='NP' or simplified_tokens[j][1]=='N') and simplified_tokens[j][0].lower() not in ('url123','hashtag123','discount123','person123','rt'):
						word = simplified_tokens[j][0].lower()
						if  j-1 >= 0:
							scan = True
							while j >= 0 and (scan):
								if (simplified_tokens[j-1][1] == 'NP' or simplified_tokens[j-1][1] == 'N' or simplified_tokens[j-1][1] == 'NUM' or simplified_tokens[j-1][0] == "'" or simplified_tokens[j-1][0] == "-") and simplified_tokens[j-1][0].lower() not in ('url123','hashtag123','discount123','person123','rt'):
									word = simplified_tokens[j-1][0].lower() + " " + word
									j = j-1
								else:
									scan = False
						if word!="":
							entity[word] = 4 if word not in entity else entity[word] + 4
							found = True
					j = j-1
			
			#forward lookup on the basis of certain obvious prepositions/noun/conjunction
			if simplified_tokens[i][0].lower() == 'off' or simplified_tokens[i][0].lower() == 'on' or simplified_tokens[i][0].lower() == 'at' or simplified_tokens[i][0].lower() == 'with' or simplified_tokens[i][0].lower() == 'deal':
				#if location reference, give a weightage of 3
				weightage = 6 if simplified_tokens[i][0].lower() == 'at' else 3
				j=i+1
				found = False
				word =""
				while(j < len(simplified_tokens) and not(found)):
					if (simplified_tokens[j][1]=='NP' or simplified_tokens[j][1]=='N' ) and simplified_tokens[j][0].lower() not in ('url123','hashtag123','discount123','person123','on','rt','sale','.'):
						word = simplified_tokens[j][0].lower()
						j = j + 1
						if  j < len(simplified_tokens):
							scan = True
							while j < len(simplified_tokens) and (scan) :
								if (simplified_tokens[j][1] == 'NP' or  simplified_tokens[j][1] == 'N' or  simplified_tokens[j][1] == 'NUM' or simplified_tokens[j][0] == "-" or simplified_tokens[j][0] == ".") and simplified_tokens[j][0].lower() not in ('url123','hashtag123','discount123','person123','rt','sale') :
									#if a . is found, look if the previous word is one lettered. Can be an abbreviation like j.crew
									if simplified_tokens[j][1] == '.':
										if len(simplified_tokens[j-1][0])==1 and j+1<len(simplified_tokens):
											word = word + "." +simplified_tokens[j+1][0]
											scan = False
										break
									word = word + " " + simplified_tokens[j][0].lower()
									j = j + 1
								else:
									if word =='':	#scan till a word is found
										j=j+1
									else:
										scan=False
						if word!="":
							entity[word] = weightage if word not in entity else entity[word] + weightage
							found = True
					j = j+1
		
		#check if the word identified as topic is in the global list of topics. If it is boost the scores to cluster them into one category
		for key in entity.keys():
			if key in self.global_topics:
				entity[key] = entity[key] + 10
			if key in self.global_topics_not:
				entity[key] = 0
		entity = sorted(entity.iteritems(), key=operator.itemgetter(1), reverse=True)	#sort the topics in descending order of their weights
		
		for i in range(0,len(entity)):
			topic = entity[i][0]
			if topic not in self.global_topics_not:
				#adjust the ' in the topic list, like gaurav's or can't, etc.
				if (len(topic.split(" "))>3):
					words = topic.split(" ")
					if words[len(words)-2] == "'":
						topic = words[len(words)-3] + " " + words[len(words)-2] + words[len(words)-1]
					elif words[len(words)-1] == "'":
						topic = words[len(words)-3] + " " + words[len(words)-2]
					else:
						topic = words[len(words)-2] + " " + words[len(words)-1]
				#had to adjust the topic for 2 words specifically.
				elif (len(topic.split(" "))>2):
					words = topic.split(" ")
					for i in range(0,len(words)):
						if i == 0:
							topic = words[i]
						elif words[i-1] == "'":
							topic = topic + words[i]
						elif words[i] == "'":
							topic = topic + words[i]
						else:
							topic = topic + " " +words[i]
				if topic not in self.global_topics_not and topic!='' and topic!=' ':
					if topic not in self.global_topics:
						self.global_topics[topic] = 1
					break
		return topic
Example no. 41
	def tag_pos(self,tweet):
		entity = {}
		
		hashtags = self.hashtag.findall(tweet)
		for tag in hashtags:
			entity[tag] = 2 if tag not in entity else entity[tag] + 2
		tweet = self.hashtag.sub('hashtag123 ',tweet)
		
		tweet = self.url.sub('url123 ', tweet)
		
		discounts = self.discount.findall(tweet)
		for discount in discounts:
			entity[discount] = 1 if discount not in entity else entity[discount] + 1
		tweet = self.discount.sub('discount123 ',tweet)
		
		tweet = self.person.sub('person123 ',tweet)
		
		text = nltk.wordpunct_tokenize(tweet)
		tokens = nltk.pos_tag(text)
		simplified_tokens = [(word, simplify_wsj_tag(tag)) for word, tag in tokens]
		
		topic = ""
		
		for i in range(0,len(simplified_tokens)):
			if simplified_tokens[i][0].lower() == 'sale':
				j = i-1
				found = False
				while (j > 0 and not(found)):
					if (simplified_tokens[j][1]=='NP' or simplified_tokens[j][1]=='N') and simplified_tokens[j][0].lower() not in ('url123','hashtag123','discount123','person123','rt'):
						word = simplified_tokens[j][0].lower()
						if  j-1 > 0:
							scan = True
							while j > 0 and (scan):
								if (simplified_tokens[j-1][1] == 'NP' or simplified_tokens[j-1][1] == 'N' or simplified_tokens[j-1][1] == 'NUM' or simplified_tokens[j-1][0] == "'" or simplified_tokens[j-1][0] == "-") and simplified_tokens[j-1][0].lower() not in ('url123','hashtag123','discount123','person123','rt'):
									word = simplified_tokens[j-1][0].lower() + " " + word
									j = j-1
								else:
									scan = False
						if word!="":
							entity[word] = 4 if word not in entity else entity[word] + 4
							found = True
					j = j-1
			
			if simplified_tokens[i][0].lower() == 'off' or simplified_tokens[i][0].lower() == 'on' or simplified_tokens[i][0].lower() == 'at':
				j=i+1
				found = False
				while(j < len(simplified_tokens) and not(found)):
					if simplified_tokens[j][1]=='NP' and simplified_tokens[j][0].lower() not in ('url123','hashtag123','discount123','person123','on','.','rt'):
						word = simplified_tokens[j][0].lower()
						j = j+1
						if  j < len(simplified_tokens):
							scan = True
							while j < len(simplified_tokens) and (scan) :
								if (simplified_tokens[j][1] == 'NP' or  simplified_tokens[j][1] == 'N' or  simplified_tokens[j][1] == 'NUM' or simplified_tokens[j][0] == "-") and simplified_tokens[j][0].lower() not in ('url123','hashtag123','discount123','person123','.','rt') :
									word = word + " " + simplified_tokens[j][0].lower()
									j = j + 1
								else:
									if word == "":
										j=j+1
									else:
										scan = False
						if word!="":
							entity[word] = 3 if word not in entity else entity[word] + 3
							found = True
					j = j+1
		for key in entity.keys():
			if key in self.global_topics:
				entity[key] = entity[key] + 10
		entity = sorted(entity.iteritems(), key=operator.itemgetter(1), reverse=True)
		#print entity
		if len(entity) > 0:
			topic = entity[0][0]
			if (len(topic.split(" "))>3):
				for i in range(1,len(entity)):
					if len(entity[i][0].split(" "))<3:
						topic = entity[i][0]
			if topic not in self.global_topics:
				self.global_topics[topic] = 1
			return topic
##############

wc = 0
tag_counts = FreqDist()
word_set = set()

if args.corpus in ['conll2000', 'switchboard']:
	kwargs = {}
elif args.simplify_tags:
	kwargs = {'simplify_tags': True}
else:
	kwargs = {}

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
	if args.corpus in ['conll2000', 'switchboard'] and args.simplify_tags:
		tag = simplify_wsj_tag(tag)
	
	wc += 1
	# loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
	if not isinstance(tag, basestring): tag = str(tag)
	tag_counts.inc(tag)
	word_set.add(word)

############
## output ##
############

print '%d total words\n%d unique words\n%d tags\n' % (wc, len(word_set), len(tag_counts))

if args.sort == 'tag':
	sort_key = lambda (t, c): t
Example no. 43
def get_words_tags(str):
	text = nltk.word_tokenize(str)
	word_tags = nltk.pos_tag(text)
	return [(word, simplify_wsj_tag(tag)) for word, tag in word_tags]
Example no. 44
def getSentimentFromTweet7(tweet, dico, listNeg, listBoost, listPosEmoticons, listNegEmoticons, swn, dicoSlang):
    """
    input: tweet = string; dico = dictionary which links NLTK PoS tags to synset PoS tags; swn = SentiWordNet corpus reader; listNeg and listBoost
    return: posScore, negScore, sentiment = an int equal to 0 (negative), 2 (neutral) or 4 (positive)
    
    Summary: Only the best neg and best pos are taken into account
    """

    tokenizedData = tweet.split(" ")
    PreProcessedTokenizedData = preProcess(tokenizedData, dicoSlang)
    taggedData = nltk.pos_tag(PreProcessedTokenizedData)
    simplifiedData = [(word, simplify_wsj_tag(tag)) for word, tag in taggedData]

    posScore = 0
    negScore = 0
    boosterWord = False

    for index, couple in enumerate(simplifiedData):
        synset = None
        if couple[0] in listPosEmoticons:
            if posScore < 1:
                posScore = 1
        elif couple[0] in listNegEmoticons:
            if negScore < 1:
                negScore = 1
        elif couple[1] in dico:
            synsets = swn.senti_synsets(couple[0])
            for elem in synsets:
                if elem.synset.pos == dico[couple[1]]:
                    synset = elem
                    break
            if synset != None:
                if synset.neg_score == synset.pos_score:
                    boosterWord = True
                elif index > 0:
                    coupleBefore = simplifiedData[(index - 1)]
                    wordBefore = coupleBefore[0]
                    if wordBefore in listNeg:
                        if posScore < synset.neg_score:
                            posScore = synset.neg_score
                        if negScore < synset.pos_score:
                            negScore = synset.pos_score
                    elif (wordBefore in listBoost) or (boosterWord):
                        if index > 1:
                            coupleBeforeBis = simplifiedData[(index - 2)]
                            wordBeforeBis = coupleBeforeBis[0]
                            if wordBeforeBis in listNeg:
                                if posScore < (synset.neg_score + 1):
                                    posScore = synset.neg_score + 1
                                if negScore < (synset.pos_score + 1):
                                    negScore = synset.pos_score + 1
                            else:
                                if posScore < (synset.pos_score + 1):
                                    posScore = synset.pos_score + 1
                                if negScore < (synset.neg_score + 1):
                                    negScore = synset.neg_score + 1
                        else:
                            if posScore < (synset.pos_score + 1):
                                posScore = synset.pos_score + 1
                            if negScore < (synset.neg_score + 1):
                                negScore = synset.neg_score + 1
                    else:
                        if index > 1:
                            coupleBeforeBis = simplifiedData[(index - 2)]
                            wordBeforeBis = coupleBeforeBis[0]
                            if wordBeforeBis in listNeg:
                                if posScore < synset.neg_score:
                                    posScore = synset.neg_score
                                if negScore < synset.pos_score:
                                    negScore = synset.pos_score
                            else:
                                if posScore < synset.pos_score:
                                    posScore = synset.pos_score
                                if negScore < synset.neg_score:
                                    negScore = synset.neg_score
                        else:
                            if posScore < synset.pos_score:
                                posScore = synset.pos_score
                            if negScore < synset.neg_score:
                                negScore = synset.neg_score
                    boosterWord = False
                else:
                    if posScore < synset.pos_score:
                        posScore = synset.pos_score
                    if negScore < synset.neg_score:
                        negScore = synset.neg_score
    if posScore > negScore:
        sentiment = 4
    elif posScore == negScore:
        sentiment = 2
    else:
        sentiment = 0

    return [posScore, negScore, sentiment]
Example no. 45
 def tag(self, sentence):
     tokens = nltk.word_tokenize(sentence)
     tagged = nltk.pos_tag(tokens)
     simpl_tagged = [(word, simplify_wsj_tag(tag)) for word, tag in tagged]
     return simpl_tagged
Example no. 46
verb = []
adjective = []

#For every line read in the file. 
for line in lines:
	c = []	 #An empty list that will be added to the main list. 2d List.
	d = []
	e = []
	noun.append(c)
	verb.append(d)
	adjective.append(e)

	tokens = nltk.word_tokenize(line)  #Tokenizing by word.
	tagged = nltk.pos_tag(tokens)

	simplified = [(word, simplify_wsj_tag(tag)) for word, tag in tagged] #To create simplified POS tags.
	print simplified
	num = len(simplified)

	for i in xrange(num):
		# print simplified[i][0] ,   #Testing purpose.
		# print simplified[i][1]
		if simplified[i][0] not in useless:
			check(count,simplified[i]) #Function call
	count = count + 1	

#For sentences without any specific POS, we are appending NA to the list corresponding to the sentence. 
for li in noun:
	if (len(li) == 0):
		li.append("NA")
Example no. 47
def postag(word):
	try:
		return simplify_wsj_tag(pos_tag([word])[0][1])
	except:
		return "UNK"
Example no. 48
for u in f.lexUnit:
    print u

fn.lexical_units(r'(?i)look')


from pattern.en import wordnet


[x for x in f.FE]
f.frameRelations

all_lu = set()
for f in fn.frames():    
    lus = [ lu.split('.')[1] for lu in fn.frame(f.ID).lexUnit ]
    for lu in lus:
        all_lu.add(lu)


import nltk
from nltk.tag.simplify import simplify_wsj_tag

from nltk.tag import simplify

tagged_sent = nltk.pos_tag(tokens)
simplified = [(word, simplify_wsj_tag(tag)) for word, tag in tagged_sent]