コード例 #1
0
ファイル: test_en.py プロジェクト: EricSchles/pattern
 def test_tag(self):
     """Assert that en.tag() POS-tags a two-word phrase and returns [] for ""."""
     # Assert [("black", "JJ"), ("cats", "NNS")].
     v = en.tag("black cats")
     self.assertEqual(v, [("black", "JJ"), ("cats", "NNS")])
     # Empty input must yield an empty tag list, not an error.
     v = en.tag("")
     self.assertEqual(v, [])
     print "pattern.en.tag()"
コード例 #2
0
	def get(self, description, index = False, field = False, debug = False, fields = False):
		"""Automatically extract authorities (all proper nouns) from descriptions,
		link them to said descriptions and return an index of links and authorities.
		
		Keyword arguments:
		description	---	Either a description from an EHRI.get() descriptions list or an EHRI.get() descriptions list
		index	---	Index of items, if already exists
		field ---	Field to query, default is scopeAndContent
		debug ---	Debug mode : print details during execution
		fields	---	If more than one field
		
		"""
		# Optional overrides for instance state.
		if index:
			self.index = index
		if field:
			self.field = field
		if debug:
			self.debug = debug
		#If Description is a list of description, then we run a loop on it
		if isinstance(description, list):
			for element in description:
				self.get(element, fields = fields)
		else:
			if self.debug:
				print "Handling Item Id " + description[self.identifier]
			
			try:
				# POS-tag either the joined requested fields or the single
				# configured field (tag presumably from pattern.en).
				if fields:
					tokens = tag(". ".join([description[item] for item in description if item in fields]))
				else:
					tokens = tag(description[self.field])
			except:
				# NOTE(review): bare except + sys.exit() aborts the whole run on
				# any tagging error — consider narrowing and reporting instead.
				print "Tokenization failed for " + self.field
				sys.exit()

			i = 0
			entities = []  # NOTE(review): never read afterwards — appears vestigial.
			while i < len(tokens):
				#Setting up temp variables
				name, pos = tokens[i]
				z = 1
				if pos == "NNP":
					# Merge consecutive proper nouns (and "X of Y" chains) into
					# one multi-word entity name; z counts consumed tokens.
					entity_name = name
					#if tokens[i+z]:
					if i + z + 1 < len(tokens):
						while tokens[i+z][1] == "NNP" or (z + i + 1 < len(tokens) and tokens[i+z][0].lower() == "of" and tokens[i+z+1][1] == "NNP") :
							entity_name += " " + tokens[i+z][0]
							z += 1
							#Breaking it if not anymore in  index range
							if z + i == len(tokens):
								break
					self.index["authorities"].append(entity_name)
					if description["idDoc"] not in self.index["items"]:
						self.index["items"][description["idDoc"]] = []
					self.index["items"][description["idDoc"]].append(entity_name)
				i += z
		
		return self.index
コード例 #3
0
ファイル: model.py プロジェクト: vm/lessandmore
    def transform(self, text, less, more):
        """transforms a body of text to have less of less and more of more!

        :param text: text to transform
        :type text: str

        :param less: list of 'less' words
        :type less: list

        :param more: list of 'more' words
        :type more: list

        :returns: transformed text
        :rtype: str
        """

        last_was_article = False  # NOTE(review): never read — appears vestigial.
        new_text = []

        # Drop less/more entries whose POS class is ignored.
        # NOTE(review): en.tag(l)[0] assumes every entry yields at least one
        # token — an empty string would raise IndexError here.
        less = [l for l in less if not self._ignore(l, en.tag(l)[0][1])]
        more = [m for m in more if not self._ignore(m, en.tag(m)[0][1])]

        # iterate over words
        for word, pos in en.tag(text):

            if word not in self.model or self._ignore(word, pos):
                if self._is_punc(pos):
                    # '\b' marks punctuation so the join below glues it to the
                    # preceding token without an extra space.
                    new_text.append(u'\b' + word)
                else:
                    new_text.append(word)
            else:
                new_word = self._transform_word(word, pos, less, more)

                # handle 'a' v. 'an'
                if new_text and new_text[-1] in ['a', 'an']:
                    new_text[-1] = 'an' if new_word[0] in 'aeiou' else 'a'

                new_text.append(new_word)

        ret = ''

        # remove at backspaces - this is dumb
        for t in new_text:
            if t.startswith('\b'):
                ret += t[1:]
            else:
                ret += ((' ' + t) if ret != '' else t)

        return ret
コード例 #4
0
ファイル: textcleaner.py プロジェクト: abs51295/gensim
def clean_text_by_word(text, deacc=True):
    """Tokenize *text* into words, filter and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Maps each word to its :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`.
    """
    without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    originals = list(tokenize(without_acronyms, to_lower=True, deacc=deacc))
    processed = [join_words(chunk, "") for chunk in preprocess_documents(originals)]
    if HAS_PATTERN:
        # tag() needs the words joined back together so each one is tagged in context.
        pos_tags = tag(join_words(originals))
    else:
        pos_tags = None
    units = merge_syntactic_units(originals, processed, pos_tags)
    return {unit.text: unit for unit in units}
コード例 #5
0
ファイル: botlang.py プロジェクト: d-baker/bots
    def posify(self, bottish, english, dictionary):
        """Return [bottish, pos-label] for *english*, transforming *bottish*
        to agree with the English word's part of speech.

        Nouns are run through pluralify(), adjectives through adjectify();
        the verb branch is still disabled (see commented-out code).
        """
        # TODO incomplete - add more tags!
        tags = {
            ("NN", "NNS"): "noun",
            ("NNP", "NNPS"): "prop. noun",
            "CC": "conj.",
            "DT": "det.",
            "JJ": "adj.",
            ("PRP", "PRP$"): "pers. pron",
            ("RB", "RBS"): "adv.",
            "UH": "interj.",
            ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"): "verb"
        }

        # POS-tag the English word once; the original re-ran tag(english)
        # for every key in the dict above.
        english_pos = tag(english)[0][1]

        pos = ""
        for key in tags.keys():
            # Tuple keys test exact membership; string keys test substring
            # containment — both via `in`, preserved from the original lookup.
            if english_pos in key:
                pos = tags[key]

        if pos == "noun":
            bottish = self.pluralify(bottish, english, dictionary)
        elif pos == "adj.":
            bottish = self.adjectify(bottish, english, dictionary)
        #elif pos == "verb":
        #    bottish = self.conjugify(bottish, english, dictionary)

        return [bottish, pos]
コード例 #6
0
    def create_description(self):
        """Generate patent-style prose into self.description.

        Searches self.source_text for verb..noun-of-noun phrase patterns,
        conjugates each phrase's verbs to third-person singular, then strings
        the phrases together behind randomly chosen sentence/paragraph prefixes.
        """
        pat = 'VB|VBD|VBZ|VBG * NN IN * NN'
        #pat = 'PRP * VB|VBD|VBZ|VBG * NN'
        phrases = search.search_out(self.source_text, pat)
        conjugated_phrases = []
        for phrase in phrases:
            words = []
            for word, pos in tag(phrase):
                if pos in ["VBZ", "VBD", "VB", "VBG"]:
                    words.append(conjugate(word, "3sg"))
                #elif pos == "NN" and random.random() < .1:
                    #words.append(self.define_word(word))
                else:
                    words.append(word)
            conjugated_phrases.append(' '.join(words))

        # NOTE(review): materialized but never used below.
        artifacts = list(self.artifacts)

        sentence_prefixes = ["The present invention", "The device", "The invention"]
        paragraph_prefixes = ["The present invention", "According to a beneficial embodiment, the invention", "According to another embodiment, the device", "According to a preferred embodiment, the invention", "In accordance with an alternative specific embodiment, the present invention"]
        i = 0
        self.description = ''
        for phrase in conjugated_phrases:
            line = ""
            if i == 0:
                # The first phrase always opens the description.
                line = paragraph_prefixes[0] + " " + phrase
            else:
                # 10% chance to start a new paragraph, otherwise continue inline.
                if random.random() < .1:
                    line = "\n\n" + random.choice(paragraph_prefixes) + " " + phrase
                else:
                    line = random.choice(sentence_prefixes) + " " + phrase
            self.description += line + ". "
            i += 1
コード例 #7
0
ファイル: doc_analysis_final.py プロジェクト: dbgannon/sciml
def buildVectorizer(bio):
    """Build a TF-IDF model over the noun/adjective content of *bio*.

    Each document is reduced to its JJ/NNS/NN/NNP words (via pattern's tag()),
    then vectorized with sklearn's TfidfVectorizer using English plus
    science-domain stopwords.

    Returns (nounlist, vectorizer, X, Xinv): the reduced documents, the
    fitted vectorizer, the doc-term sparse matrix and its inverse transform.
    """
    nounlist = []
    for doc in bio:
        st = ""
        for (word, pos) in tag(doc):
            if pos in ["JJ", "NNS", "NN", "NNP"]:
                st = st+word+" "
            else:
                if st!= "":
                    # NOTE(review): replaces the trailing space with a space —
                    # effectively a no-op; presumably meant to add a separator.
                    st = st[0:-1]+" "
                    #print "got one"
        nounlist.extend([st])
    sciencestopwords = set([u'model','according', 'data', u'models', 'function', 'properties', 'approach', 'parameters', 
                    'systems', 'number', 'order', u'data', 'analysis', u'information', u'journal',
                    'results','using','research', 'consumers', 'scientists', 'model', 'models', 'journal',
                    'researchers','paper','new','study','time','case', 'simulation', u'simulation', 'equation',
                    'based','years','better', 'theory', 'particular','many','due','much','set', 'studies', 'systems',
                    'simple', 'example','work','non','experiments', 'large', 'small', 'experiment', u'experiments',
                    'provide', 'analysis', 'problem', 'method', 'used', 'methods'])
    #now doing the new vectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    english = nltk.corpus.stopwords.words('english')
    newstop = english+list(sciencestopwords)
    vectorizer = TfidfVectorizer(min_df=1, max_df=.5, stop_words=newstop, decode_error='ignore')
    X = vectorizer.fit_transform(nounlist)
    Xinv = vectorizer.inverse_transform(X)
        #X is a sparse matrix of docs x vocab size (7638). 
    #so X[doc_num] is the sparse vector of its words. 
    #the ||X[doc_num]|| = 1 there are 7638 unique words and 755 docs. with a total number of 38888 non-zeros.
    #Xinv[doc_num] is the list of words in the doc.
     
    return nounlist, vectorizer, X, Xinv
コード例 #8
0
ファイル: tedbot.py プロジェクト: alexislloyd/tedbot
def getImage(refstring):
	"""Pick a random noun from *refstring* and fetch a matching Flickr photo.

	Returns {'url': ..., 'noun': ...}; 'url' is None when the Flickr search
	has no results, and the whole return is None when the string has no nouns.
	"""
	tagged = tag(refstring)

	# Proper nouns, noun-phrase heads and common nouns are all candidates.
	nouns = [word for word,pos in tagged if pos == 'NNP' or pos == 'NP' or pos == 'NN']
	try:
		query = random.choice(nouns)
	except IndexError:
		#somehow this string has no nouns!
		if DEBUG: print("Paragraph with no nouns:\n" + refstring, file=sys.stderr)
		return None
	
	if DEBUG: print(query, file=sys.stderr)

	flickr = flickrapi.FlickrAPI(flickr_key, flickr_secret, format='parsed-json')
	result = flickr.photos_search(api_key = flickr_key, text = query, privacy_filter = 1, safe_search=1, sort='interestingness-desc', orientation="landscape")
	try:
		pick = random.choice(result['photos']['photo'])
		url = 'https://farm' + str(pick['farm']) + '.staticflickr.com/' + str(pick['server']) + '/' + str(pick['id']) + '_' + str(pick['secret']) + '_z.jpg'
	except IndexError:
		# there were no results, so the random.choice call failed above. This is OK, we'll just move on.
		url = None
	image = {}
	image['url'] = url
	image['noun'] = query
	return image
コード例 #9
0
ファイル: model.py プロジェクト: vm/lessandmore
    def _transform_word(self, word, pos, less, more):
        """transforms a word to be less less and more more

        :param word: word to transform
        :type word: str

        :param pos: part of speech of the word
        :type pos: str

        :param less: list of 'less' words
        :type less: list

        :param more: list of 'more' words
        :type more: list

        :returns: transformed word
        :rtype: str
        """

        new_word = self._get_similar_word(word, less, more)
        # NOTE(review): en.tag(new_word)[0] assumes the replacement always
        # yields at least one token.
        new_pos = en.tag(new_word)[0][1]

        # Bail out when the POS class changed or nothing was substituted.
        if (pos[:2] != new_pos[:2]) or word == new_word:
            return word

        # handle noun
        if pos.startswith('NN'):

            # pluralization
            if pos.endswith('S') and not new_pos.endswith('S'):
                new_word = en.pluralize(new_word)

            elif not pos.endswith('S') and new_pos.endswith('S'):
                new_word = en.singularize(new_word)

            # capitalization
            if word[0].isupper():
                new_word = new_word[0].upper() + new_word[1:]
            else:
                new_word = new_word.lower()

        # handle verb
        elif pos.startswith('VB'):

            # Mirror the original verb's tense/person/number on the new verb.
            tense, person, number = en.tenses(word)[0][:3]

            # conjugation
            conjugated = en.conjugate(new_word,
                                    tense=tense,
                                    person=person,
                                    number=number,
                                    parse=False)

            if conjugated is not None:
                new_word = conjugated

        # remove underscores for joint words
        new_word = new_word.replace('_', ' ')

        return new_word
コード例 #10
0
def getEntities(parser, tweet, xEntities):
	"""Collect noun/pronoun entities from *tweet* into the dict *xEntities*.

	Runs three taggers — spaCy (*parser*), TextBlob, and pattern's tag() —
	and records every NNP/NN/PRP token as word -> tag-string, first tagger
	wins (existing keys are never overwritten).

	NOTE(review): on failure the exception object itself is *returned*
	rather than raised — callers must type-check the return value.
	"""
	try:
		spacyParsedObject = parser(tweet)
		sentence =  TextBlob(tweet)
		textblobTaggedObject = sentence.parse().split()
		patterntaggedObject = tag(tweet, tokenize=True)
		for word in patterntaggedObject:
			word, wordtag=word
			if  wordtag == "NNP" or  wordtag == "NN" or  wordtag == "PRP":
				v = str(word)
				v = v.strip()
				if(v not in xEntities):	
					xEntities[v]=str(wordtag)						
		for taggedObject in textblobTaggedObject:
			for word in taggedObject:
				word, wordtag=word[0], word[1]
				if wordtag == "NNP" or wordtag == "NN" or wordtag == "PRP":
					v = str(word)
					v = v.strip()
					if(v not in xEntities):	
						xEntities[v]=str(wordtag)
		for word in spacyParsedObject:
			if word.tag_ == "NNP" or word.tag_ == "NN" or word.tag_ == "PRP":
				v = str(word)
				v = v.strip()
				if(v not in xEntities):	
					xEntities[v]=str(word.tag_)
		return xEntities
	except Exception as e:
		return e
		
コード例 #11
0
ファイル: tag-parser.py プロジェクト: d-baker/NaNoGenMo-2014
def parse_pos(source_filename, output_filename, pos):
    """Write every word of *source_filename* whose POS tag equals *pos*
    to *output_filename*, one word per line.

    tag() (pattern.en) is applied to each whitespace-separated word in
    isolation; [0][1] is the tag of its first token.
    """
    # Context managers close both files even on error — the original
    # leaked the source handle and relied on an explicit close for output.
    with open(source_filename) as src:
        wordlist = src.read().split()

    matched_words = [word for word in wordlist if tag(word)[0][1] == pos]

    with open(output_filename, "w") as fp:
        for w in matched_words:
            fp.write("%s\n" % w)
コード例 #12
0
ファイル: denver.py プロジェクト: skyballin/RapAttack
def denver_lyrics(link):
    """Scrape an azlyrics song page and return its distinct nouns.

    Returns a list of tag() results (word/POS tuple lists) for every
    distinct 'NN'/'NNP' word longer than two characters in the lyrics.
    """
    link = link.replace('..', 'http://www.azlyrics.com/')
    html = urlopen(link).read()
    soup = BeautifulSoup(html)
    # Carve the lyric text out of the page markup and strip boilerplate.
    lyrics = str(''.join(''.join([s.text for s in soup.findAll('div')]).split('lyrics')[1:]).split('\n\n\n\n\r\nif')[0].strip()).replace('LYRICS', "").replace('JOHN DENVER', '')
    lyrics = [x for x in lyrics.splitlines() if x]
    # Drop section markers ('[...]') and indented lines, then de-punctuate.
    lyrics = ' '.join([str(x) for x in lyrics if x[0] not in ['[', ' ']])
    exclude = set(string.punctuation)
    lyrics = ''.join(ch.lower() for ch in lyrics if ch not in exclude)
    lyrics = lyrics.split()
    nounlist = []
    for word in lyrics:
        # Tag each word once — the original called tag(word) three times here.
        tagged = tag(word)
        if tagged[0][1] in ['NN', 'NNP'] and len(word) > 2 and tagged not in nounlist:
            nounlist.append(tagged)
    return nounlist


#http://www.azlyrics.com/n/neildiamond.html
#http://www.azlyrics.com/j/johndenver.html
コード例 #13
0
ファイル: tokenizer.py プロジェクト: luislezcair/gisiaws
	def tokenizer(self,url):
		#page = URL(url).download(user_agent='Mozilla/5')
           page = URL(url).download()
           text = plaintext(page, keep={})
           tokens = tag(text)
		#print tokens
           print len(tokens),' words'
           document=[]
           while tokens:
			document.append(tokens.pop(0)[0])
コード例 #14
0
ファイル: test_parsing.py プロジェクト: byteface/sing
def run(o):
	"""Demonstrate pattern.en parsing: parse(), pprint() and tag() on a
	sample sentence. *o* is accepted but unused."""

#	https://github.com/clips/pattern/blob/master/examples/03-en/03-parse.py

	import os, sys;# sys.path.insert(0, os.path.join("..", ".."))

	from pattern.en import parse, pprint, tag

	# The en module contains a fast regular expressions-based parser.
	# A parser identifies words in a sentence, word part-of-speech tags (e.g. noun, verb)
	# and groups of words that belong together (e.g. noun phrases).
	# Common part-of-speech tags: NN (noun), VB (verb), JJ (adjective), PP (preposition).
	# A tag can have a suffix, for example NNS (plural noun) or VBG (gerund verb).
	# Overview of tags: http://www.clips.ua.ac.be/pages/mbsp-tags
	s = "I eat pizza with a fork. one more test 1 Africa James Bob England Surrey Essex"
	s = parse(s,
	     tokenize = True,  # Tokenize the input, i.e. split punctuation from words.
	         tags = True,  # Find part-of-speech tags.
	       chunks = True,  # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
	    relations = True,  # Find relations between chunks.
	      lemmata = True,  # Find word lemmata.
	        light = False)

	# The light parameter determines how unknown words are handled.
	# By default, unknown words are tagged NN and then improved with a set of rules.
	# light=False uses Brill's lexical and contextual rules,
	# light=True uses a set of custom rules that is less accurate but faster (5x-10x).

	# The output is a string with each sentence on a new line.
	# Words in a sentence have been annotated with tags,
	# for example: fork/NN/I-NP/I-PNP
	# NN = noun, NP = part of a noun phrase, PNP = part of a prepositional phrase.
	print s
	print

	# Prettier output can be obtained with the pprint() command:
	pprint(s)
	print

	# The string's split() method will (unless a split character is given),
	# split into a list of sentences, where each sentence is a list of words
	# and each word is a list with the word + its tags.
	print s.split()
	print 

	# The tag() command returns a list of (word, POS-tag)-tuples.
	# With light=True, this is the fastest and simplest way to get an idea 
	# of a sentence's constituents:
	s = "I eat pizza with a fork. one more test 1 Africa James Bob England Surrey Essex"
	s = tag(s)
	print s
	# NOTE: the loop variable below shadows the imported tag() function.
	for word, tag in s:
	    if tag == "NN": # Find all nouns in the input string.
	        print word
コード例 #15
0
ファイル: render.py プロジェクト: assamite/cc-codecamp16
 def process(wrd):
     """Return *wrd* conjugated to past tense, unless its POS tag is an
     ignored preposition/particle or it matches an exception lemma."""
     skip_pos = ['IN', 'RP', 'TO']
     keep_as_is = ['flatter', 'flattered']
     # Prepositions, particles and 'to' pass through untouched.
     if tag(wrd)[0][1] in skip_pos:
         return wrd
     # So do words contained in any of the exception lemmas.
     if any(wrd in exception for exception in keep_as_is):
         return wrd
     return conjugate(wrd, tense=PAST)
コード例 #16
0
ファイル: textcleaner.py プロジェクト: ArifAhmed1995/gensim
def clean_text_by_word(text):
    """Tokenize *text* into words, filter and lemmatize them.

    Returns a dict mapping each word to its syntacticUnit.
    """
    no_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    words = list(tokenize(no_acronyms, to_lower=True, deacc=True))
    filtered = [join_words(chunk, "") for chunk in preprocess_documents(words)]
    if HAS_PATTERN:
        # tag needs the context of the words in the text
        pos_tags = tag(join_words(words))
    else:
        pos_tags = None
    return {unit.text: unit for unit in merge_syntactic_units(words, filtered, pos_tags)}
コード例 #17
0
def extract_pos_feat(text):
    """Build POS-tag features: one tag-sequence line per sentence, with a
    '~~~\\n' marker line opening each article of *text*."""
    pos_feat = []
    for article in text:
        pos_feat.append('~~~\n')
        for sent in article:
            # Keep only the POS tag of each (word, tag) pair from tag().
            sent_tags = [pair[1] for pair in tag(sent)]
            pos_feat.append(START + ' '.join(sent_tags) + END)
    return pos_feat
コード例 #18
0
ファイル: textcleaner.py プロジェクト: mctian/textrank-1
def clean_text_by_word(text, language="english", deacc=False):
    """Tokenize *text* into words, filter and lemmatize them.

    Returns a dict mapping each word to its syntacticUnit.
    """
    init_textcleanner(language)
    no_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    words = list(tokenize(no_acronyms, lowercase=True, deacc=deacc))
    filtered = filter_words(words)
    # tag() needs the whole text so every word is tagged in context.
    pos_tags = tag(" ".join(words)) if HAS_PATTERN else None
    units = merge_syntactic_units(words, filtered, pos_tags)
    return {unit.text: unit for unit in units}
コード例 #19
0
ファイル: nouns.py プロジェクト: KahiniWadhawan/MinasTirith
 def tagging(self, sentence):
     """POS-tag *sentence* (pattern's tag()) and append its nouns and
     adjectives, as pickled lists, to nouns.txt / adjectives.txt."""
     filename = "nouns.txt"
     filename1 = "adjectives.txt"
     tags = tag(sentence)
     #POS tagging to get the required words corresponding to the below Noun and Adjective Tags
     tags_n = [word for word, pos in tags if pos in ['NN','NNS','NNP','NNPS']]
     tags_adj = [word for word, pos in tags if pos in ['JJ','JJS','JJR']]
     # `with` closes each file itself; the original's explicit fp.close()
     # calls after every block were redundant and have been removed.
     # NOTE(review): pickling into a text-mode ('a') handle only works on
     # Python 2; Python 3 pickle requires 'ab' — confirm target interpreter.
     with open(filename, 'a') as fp:
         pickle.dump(tags_n, fp)
     with open(filename1, 'a') as fp:
         pickle.dump(tags_adj, fp)
コード例 #20
0
ファイル: botlang.py プロジェクト: d-baker/bots
    def pluralify(self, bottish, english, dictionary):
        """Return *bottish* adjusted to agree in number with *english*.

        Plural English words get the bottish plural suffix appended (reusing
        an existing singular definition when present); singular English words
        reuse a defined plural's stem with the suffix removed.
        """
        plural_suffix = "ly"

        # Tag the English word once — the original re-ran tag(english) for
        # every dictionary entry.
        english_pos = tag(english)[0][1]

        for definition in dictionary:
            # if English word is a plural
            if english_pos in ("NNS", "NNPS"):
                # if the singular English word is already defined...
                if singularize(english) == definition["english"]:
                    return definition["bottish"] + plural_suffix
                # otherwise generate a new plural
                return bottish + plural_suffix

            # if English word is a singular...
            elif english_pos in ("NN", "NNP"):
                # if a plural version is already defined...
                if pluralize(english) == definition["english"]:
                    # BUG FIX: the original used rstrip(plural_suffix), which
                    # strips *any* trailing 'l'/'y' characters (e.g.
                    # "jelly" -> "je") — remove exactly one suffix instead.
                    stem = definition["bottish"]
                    if stem.endswith(plural_suffix):
                        stem = stem[:-len(plural_suffix)]
                    return stem

        return bottish
コード例 #21
0
ファイル: emojier_en.py プロジェクト: beeva-labs/emojinews
	def tagLemma(self, word_old):
		"""Return the lemma of the *last* token of *word_old*: plural nouns
		are singularized, verbs are conjugated to the infinitive, everything
		else passes through unchanged.

		NOTE(review): when tag() yields several tokens only the last one's
		result is returned — confirm callers pass single words.
		"""
		#print tag(word_old)
		# Fallback so x is defined even if tag() yields no tokens
		# (the original raised NameError in that case).
		x = word_old
		for word, pos in tag(word_old):
			if pos=="NNS": # plural nouns -> singular
				x = singularize(word)
			elif pos in ["VB","VBG","VBZ","VBP","VBD","VBN","MD"]: # verbs -> infinitive
				# conjugate() sometimes fails and returns a falsy value;
				# fall back to the raw word in that case.
				x = conjugate(word, INFINITIVE) or word
			else:
				x = word  
		return x
コード例 #22
0
ファイル: topic_sentiment.py プロジェクト: ahmed26/sentiment
def sentiment(content):
    """Return 1 if *content* scores non-negative overall sentiment, else -1.

    Sums (positivity - negativity) from pattern's sentiment-annotated
    WordNet over every adjective/verb/adverb token.

    NOTE(review): `pos in relevant_types` is an exact match, so suffixed
    tags like "JJR"/"VBD" are skipped — confirm whether that is intended.
    """
    relevant_types = ["JJ", "VB", "RB"]  # adjectives, verbs, adverbs
    score = 0
    wordnet.sentiment.load()
    synsets = wordnet.synsets
    for word, pos in tag(content):
        if pos in relevant_types:
            try:
                synset = synsets(word, pos)[0].weight
            except KeyError:
                # incorrect part of speech tag
                continue
            # Renamed from (ps, ns, os): `os` shadowed the stdlib module name.
            positivity, negativity, _objectivity = synset
            score = score + (positivity - negativity)
    return 1 if score >= 0 else -1
コード例 #23
0
	def get(self, descriptions, mode = "link", index = False, field = False, count = False):
		""" Returns a list of names or connect a list of item to lexicon items
		
		Keyword arguments:
		descriptions	---	EHRI.get() descriptions list
		mode	---	Either link or search
		index	---	Override self.index
		field ---	Field to query, default is scopeAndContent
		count	---	Override self.count
		
		"""
		
		# Optional overrides for instance state.
		if index:
			self.index = index
		if field:
			self.field = field
		if count:
			self.count = count
		
		results = {}
		
			
		#Looping on items
		for description in descriptions:
			for word, pos in tag(description[self.field]):
				
				if pos in ["NN", "NNS", "NNP"]:
					# Normalize each noun to a lowercase singular form and count it.
					w = singularize(word)
					w = w.lower()
					if w not in results:
						results[w] = 0
					results[w] += 1
					
					# If we are looking for stats about items
					if mode == "link":
						# Title-cased form is looked up in the lexicon; matches
						# bump per-item counts and index the description by item.
						if w.title() in self.available:
							for item in self.lexicon:
								print item
								if w.title() in self.lexicon[item]:
									self.count[item] += 1
									if description[self.identifier] not in self.index:
										self.index[description[self.identifier]] = []
									self.index[description[self.identifier]].append(item)
		if mode == "link":
			return self.index
		elif mode == "search":
			return results
コード例 #24
0
def pos_tag_text(text):
    """POS-tag *text* and return [(lowercased word, WordNet POS or None), ...]."""

    def penn_to_wn_tags(pos_tag):
        # Map a Penn Treebank tag prefix to the matching WordNet constant.
        # Prefixes are mutually exclusive, so lookup order is irrelevant.
        prefix_to_wn = {'J': wn.ADJ, 'V': wn.VERB, 'N': wn.NOUN, 'R': wn.ADV}
        for prefix, wn_tag in prefix_to_wn.items():
            if pos_tag.startswith(prefix):
                return wn_tag
        return None

    return [(word.lower(), penn_to_wn_tags(pos_tag))
            for word, pos_tag in tag(text)]
コード例 #25
0
def postag_feature_builder(
        text, target_pos=('JJ', 'NN', 'VB', 'NP', 'RB', 'CD')):
    """
        faster version of the tag feature builder
        uses paten.tag instead of paten.parsetree

        Returns a {word: count} mapping (pattern.vector count) over the
        words of *text* whose POS tag is in *target_pos*; {} for empty
        input or when tagging raises IndexError.
    """
    if not text:
        return {}
    # tag each word
    try:
        result = patvec.count(
            (word for word,
             tag in paten.tag(text,
                              tokenize=True,
                              encoding='utf-8') if tag in target_pos))
    except IndexError as e:
        print text, e
        result = {}
    return result
コード例 #26
0
def clean_text_by_word(text, language="english"):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit.

    Unlike the plain-string variants, *text* here is a sequence of sentence
    objects exposing .text; each element's .basic attribute is set as a side
    effect to its tokenized form.
    """
    init_textcleanner(language)

    text_without_acronyms = [ replace_with_separator(text[i].text, "", [AB_ACRONYM_LETTERS]) for i in range(len(text)) ]
    original_sentences = [list(tokenize(text_without_acronyms[i], to_lower=True, deacc=True)) for i in range(len(text_without_acronyms))]
    # original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True))
    original_words = []
    for i, basicSentence in enumerate(original_sentences):
        # Side effect: store the tokenized form back on the sentence object.
        text[i].basic = u' '.join(basicSentence)
        original_words += basicSentence
    filtered_words = filter_words(original_words)
    if HAS_PATTERN:
        tags = tag(" ".join(original_words)) # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return { unit.text : unit for unit in units }
コード例 #27
0
	def stemming(self, tokens):
		"""Normalize *tokens*: stem nouns and adjectives, lemmatize verbs,
		singularize plurals; all other words pass through unchanged."""
		text = " ".join(tokens)
		noun_tags = ["NN"]
		verb_tags = ["VB", "VBG", "VBP", "VBZ", "VBN", "VBD"]
		plural_tags = ["NNS"]
		adjective_tags = ["JJ"]
		normalized = []
		for word, pos in tag(text):
			if pos in noun_tags:
				word = self.stemmer.stemming(word)
			if pos in verb_tags:
				word = lemma(word)
			if pos in plural_tags:
				word = singularize(word)
			if pos in adjective_tags:
				word = self.stemmer.stemming(word)
			normalized.append(word)
		return normalized
コード例 #28
0
ファイル: testing.py プロジェクト: ageek/sentiment
def sentiment(content):
    """Score *content* with pattern's sentiment-annotated WordNet: sums
    (positivity - negativity) * (1 - objectivity) over every adjective,
    verb and adverb token and returns the raw score."""

    # Lazily load the sentiment lexicon on first use.
    if len(wordnet.sentiment) == 0:
        wordnet.sentiment.load()
        
    relevant_types = ['JJ', 'VB', 'RB'] #adjectives, verbs, adverbs
    score = 0
    
    synsets = wordnet.synsets
    for word, pos in tag(content):
            if pos in relevant_types:
                try:
                    synset = synsets(word, pos)[0].weight
                except KeyError:
                    #incorrect part of speech tag
                    continue
                positivity, negativity, objectivity = synset
                # Weight polarity by how subjective the synset is.
                score = score + (positivity - negativity) * (1 - objectivity)
                
    return score
コード例 #29
0
    def pos_tag_sentence(self, sentence, verb='^'):
        """Return a fixed-length window of (word, POS) pairs around *verb*
        in *sentence*, restricted to self.pos_tags, left-padded with the
        first element to always span self.window entries (verb excluded).

        NOTE(review): raises IndexError when *verb* is absent from the
        tagged sentence — confirm callers guarantee its presence.
        """
        # Floor division: plain `/` would yield a float on Python 3 and
        # break the list slicing/pop() below; on Python 2 `//` is identical.
        half_window = (self.window // 2)

        sentence = sentence.decode('utf8', 'replace')
        sentence_pos = pattern.tag(sentence)
        sentence_pos = [e for e in sentence_pos if e[1] in self.pos_tags]
        verb_pos = [v_p for v_p, e in enumerate(sentence_pos) if e[0] == verb][0]
        start_pos, end_pos = verb_pos - half_window, verb_pos + half_window

        start_pos = 0 if start_pos < 0 else start_pos
        word_context = sentence_pos[start_pos: end_pos]

        # pad the first element to form equal length vectors
        if len(word_context) < self.window:
            window_append = [word_context[0]] * (self.window - len(word_context))
            window_append.extend(word_context)
            word_context = window_append

        # Drop the verb itself from the middle of the window.
        word_context.pop(half_window)
        return word_context
コード例 #30
0
ファイル: predictor8big.py プロジェクト: dbgannon/sciml
def clean(doc):
    """Reduce *doc* to its adjective/noun words (via pattern's tag()),
    lowercased and with science-domain stopwords removed; returns them
    joined into a single space-prefixed string."""
    st = ""
    sciencestopwords = set([u'model','according', 'data', u'models', 'function', 'properties', 'approach', 'parameters', 
                'systems', 'number', 'order', u'data', 'analysis', u'information', u'journal',
                'results','using','research', 'consumers', 'scientists', 'model', 'models', 'journal',
                'researchers','paper','new','study','time','case', 'simulation', u'simulation', 'equation',
                'based','years','better', 'theory', 'particular','many','due','much','set', 'studies', 'systems',
                'simple', 'example','work','non','experiments', 'large', 'small', 'experiment', u'experiments',
                'provide', 'analysis', 'problem', 'method', 'used', 'methods'])
    # Keep only adjectives and (proper/plural) nouns.
    for (word, pos) in tag(doc):
        if pos in ["JJ", "NNS", "NN", "NNP"]:
            st = st+word+" "
        else:
            if st!= "":
                # NOTE(review): replaces the trailing space with a space —
                # effectively a no-op; presumably meant to add a separator.
                st = st[0:-1]+" "
                #print "got one"
    # Lowercase, then drop the domain stopwords.
    wordl = st.lower().split()
    s = ""
    for word in wordl:
        if word not in sciencestopwords:
            s = s+" "+word
    return s