Example #1
 def test_tag(self):
     # Assert [("black", "JJ"), ("cats", "NNS")].
     v = en.tag("black cats")
     self.assertEqual(v, [("black", "JJ"), ("cats", "NNS")])
     v = en.tag("")
     self.assertEqual(v, [])
     print "pattern.en.tag()"
Example #2
	def get(self, description, index = False, field = False, debug = False, fields = False):
		"""Automatically extract authorities (all proper nouns) from descriptions, link them to those descriptions and return an index of links and authorities
		
		Keyword arguments:
		description	---	Either a single description from an EHRI.get() descriptions list or the whole EHRI.get() descriptions list
		index	---	Index of items, if one already exists
		field ---	Field to query, default is scopeAndContent
		debug ---	Debug mode: print details during execution
		fields	---	List of fields, if more than one field should be queried
		
		"""
		if index:
			self.index = index
		if field:
			self.field = field
		if debug:
			self.debug = debug
		#If Description is a list of description, then we run a loop on it
		if isinstance(description, list):
			for element in description:
				self.get(element, fields = fields)
		else:
			if self.debug:
				print "Handling Item Id " + description[self.identifier]
			
			try:
				if fields:
					tokens = tag(". ".join([description[item] for item in description if item in fields]))
				else:
					tokens = tag(description[self.field])
			except:
				print "Tokenization failed for " + self.field
				sys.exit()

			i = 0
			entities = []
			while i < len(tokens):
				#Setting up temp variables
				name, pos = tokens[i]
				z = 1
				if pos == "NNP":
					entity_name = name
					#if tokens[i+z]:
					if i + z + 1 < len(tokens):
						while tokens[i+z][1] == "NNP" or (z + i + 1 < len(tokens) and tokens[i+z][0].lower() == "of" and tokens[i+z+1][1] == "NNP") :
							entity_name += " " + tokens[i+z][0]
							z += 1
							#Break once we run past the end of the token list
							if z + i == len(tokens):
								break
					self.index["authorities"].append(entity_name)
					if description["idDoc"] not in self.index["items"]:
						self.index["items"][description["idDoc"]] = []
					self.index["items"][description["idDoc"]].append(entity_name)
				i += z
		
		return self.index
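
A minimal standalone sketch (not the class method above, and only an approximation of its NNP-grouping logic): collect runs of consecutive proper nouns, optionally joined by a single "of", using pattern.en.tag.

from pattern.en import tag

def proper_noun_entities(text):
	tokens = tag(text)
	entities, i = [], 0
	while i < len(tokens):
		word, pos = tokens[i]
		if pos == "NNP":
			name = [word]
			i += 1
			# absorb following NNPs, allowing one "of" between them
			while i < len(tokens) and (tokens[i][1] == "NNP" or
					(tokens[i][0].lower() == "of" and i + 1 < len(tokens) and tokens[i + 1][1] == "NNP")):
				name.append(tokens[i][0])
				i += 1
			entities.append(" ".join(name))
		else:
			i += 1
	return entities

print(proper_noun_entities("The United States of America signed the treaty in Paris."))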
Example #3
    def transform(self, text, less, more):
        """transforms a body of text to have less of less and more of more!

        :param text: text to transform
        :type text: str

        :param less: list of 'less' words
        :type less: list

        :param more: list of 'more' words
        :type more: list

        :returns: transformed text
        :rtype: str
        """

        last_was_article = False
        new_text = []

        less = [l for l in less if not self._ignore(l, en.tag(l)[0][1])]
        more = [m for m in more if not self._ignore(m, en.tag(m)[0][1])]

        # iterate over words
        for word, pos in en.tag(text):

            if word not in self.model or self._ignore(word, pos):
                if self._is_punc(pos):
                    new_text.append(u'\b' + word)
                else:
                    new_text.append(word)
            else:
                new_word = self._transform_word(word, pos, less, more)

                # handle 'a' v. 'an'
                if new_text and new_text[-1] in ['a', 'an']:
                    new_text[-1] = 'an' if new_word[0] in 'aeiou' else 'a'

                new_text.append(new_word)

        ret = ''

        # strip the backspace markers so punctuation attaches to the previous token
        for t in new_text:
            if t.startswith('\b'):
                ret += t[1:]
            else:
                ret += ((' ' + t) if ret != '' else t)

        return ret
Example #4
    def create_description(self):
        pat = 'VB|VBD|VBZ|VBG * NN IN * NN'
        #pat = 'PRP * VB|VBD|VBZ|VBG * NN'
        phrases = search.search_out(self.source_text, pat)
        conjugated_phrases = []
        for phrase in phrases:
            words = []
            for word, pos in tag(phrase):
                if pos in ["VBZ", "VBD", "VB", "VBG"]:
                    words.append(conjugate(word, "3sg"))
                #elif pos == "NN" and random.random() < .1:
                    #words.append(self.define_word(word))
                else:
                    words.append(word)
            conjugated_phrases.append(' '.join(words))

        artifacts = list(self.artifacts)

        sentence_prefixes = ["The present invention", "The device", "The invention"]
        paragraph_prefixes = ["The present invention", "According to a beneficial embodiment, the invention", "According to another embodiment, the device", "According to a preferred embodiment, the invention", "In accordance with an alternative specific embodiment, the present invention"] 
        i = 0
        self.description = ''
        for phrase in conjugated_phrases:
            line = ""
            if i == 0:
                line = paragraph_prefixes[0] + " " + phrase
            else:
                if random.random() < .1:
                    line = "\n\n" + random.choice(paragraph_prefixes) + " " + phrase
                else:
                    line = random.choice(sentence_prefixes) + " " + phrase
            self.description += line + ". "
            i += 1
Example #5
    def posify(self, bottish, english, dictionary):
        # TODO incomplete - add more tags!
        tags = {
            ("NN", "NNS"): "noun",
            ("NNP", "NNPS"): "prop. noun",
            "CC": "conj.",
            "DT": "det.",
            "JJ": "adj.",
            ("PRP", "PRP$"): "pers. pron",
            ("RB", "RBS"): "adv.",
            "UH": "interj.",
            ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"): "verb"
        }

        pos = ""
        for key in tags.keys():
            if tag(english)[0][1] in key:
                pos = tags[key]

        if pos == "noun":
            bottish = self.pluralify(bottish, english, dictionary)
        elif pos == "adj.":
            bottish = self.adjectify(bottish, english, dictionary)
        #elif pos == "verb":
        #    bottish = self.conjugify(bottish, english, dictionary)

        return [bottish, pos]
Example #6
def getEntities(parser, tweet, xEntities):
	try:
		spacyParsedObject = parser(tweet)
		sentence =  TextBlob(tweet)
		textblobTaggedObject = sentence.parse().split()
		patterntaggedObject = tag(tweet, tokenize=True)
		for word in patterntaggedObject:
			word, wordtag = word
			if wordtag == "NNP" or wordtag == "NN" or wordtag == "PRP":
				v = str(word)
				v = v.strip()
				if v not in xEntities:
					xEntities[v] = str(wordtag)
		for taggedObject in textblobTaggedObject:
			for word in taggedObject:
				word, wordtag = word[0], word[1]
				if wordtag == "NNP" or wordtag == "NN" or wordtag == "PRP":
					v = str(word)
					v = v.strip()
					if v not in xEntities:
						xEntities[v] = str(wordtag)
		for word in spacyParsedObject:
			if word.tag_ == "NNP" or word.tag_ == "NN" or word.tag_ == "PRP":
				v = str(word)
				v = v.strip()
				if v not in xEntities:
					xEntities[v] = str(word.tag_)
		return xEntities
	except Exception as e:
		return e
		
Example #7
def clean_text_by_word(text, deacc=True):
    """Tokenize a given text into words, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    >>> from gensim.summarization.textcleaner import clean_text_by_word
    >>> clean_text_by_word("God helps those who help themselves")
    {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
    'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
    'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
Example #8
def getImage(refstring):
	tagged = tag(refstring)

	nouns = [word for word,pos in tagged if pos == 'NNP' or pos == 'NP' or pos == 'NN']
	try:
		query = random.choice(nouns)
	except IndexError:
		#somehow this string has no nouns!
		if DEBUG: print("Paragraph with no nouns:\n" + refstring, file=sys.stderr)
		return None
	
	if DEBUG: print(query, file=sys.stderr)

	flickr = flickrapi.FlickrAPI(flickr_key, flickr_secret, format='parsed-json')
	result = flickr.photos_search(api_key = flickr_key, text = query, privacy_filter = 1, safe_search=1, sort='interestingness-desc', orientation="landscape")
	try:
		pick = random.choice(result['photos']['photo'])
		url = 'https://farm' + str(pick['farm']) + '.staticflickr.com/' + str(pick['server']) + '/' + str(pick['id']) + '_' + str(pick['secret']) + '_z.jpg'
	except IndexError:
		# there were no results, so the random.choice call failed above. This is OK, we'll just move on.
		url = None
	image = {}
	image['url'] = url
	image['noun'] = query
	return image
Example #9
def buildVectorizer(bio):
    nounlist = []
    for doc in bio:
        st = ""
        for (word, pos) in tag(doc):
            if pos in ["JJ", "NNS", "NN", "NNP"]:
                st = st+word+" "
            else:
                if st!= "":
                    st = st[0:-1]+" "
                    #print "got one"
        nounlist.extend([st])
    sciencestopwords = set([u'model','according', 'data', u'models', 'function', 'properties', 'approach', 'parameters', 
                    'systems', 'number', 'order', u'data', 'analysis', u'information', u'journal',
                    'results','using','research', 'consumers', 'scientists', 'model', 'models', 'journal',
                    'researchers','paper','new','study','time','case', 'simulation', u'simulation', 'equation',
                    'based','years','better', 'theory', 'particular','many','due','much','set', 'studies', 'systems',
                    'simple', 'example','work','non','experiments', 'large', 'small', 'experiment', u'experiments',
                    'provide', 'analysis', 'problem', 'method', 'used', 'methods'])
    #now doing the new vectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    english = nltk.corpus.stopwords.words('english')
    newstop = english+list(sciencestopwords) 
    vectorizer = TfidfVectorizer(min_df=1, max_df=.5, stop_words=newstop, decode_error='ignore')
    X = vectorizer.fit_transform(nounlist)
    Xinv = vectorizer.inverse_transform(X)
    #X is a sparse matrix of docs x vocab size (7638).
    #so X[doc_num] is the sparse vector of its words.
    #the ||X[doc_num]|| = 1; there are 7638 unique words and 755 docs, with a total of 38888 non-zeros.
    #Xinv[doc_num] is the list of words in the doc.

    return nounlist, vectorizer, X, Xinv
Example #10
    def _transform_word(self, word, pos, less, more):
        """transforms a word to be less less and more more

        :param word: word to transform
        :type word: str

        :param pos: part of speech of the word
        :type pos: str

        :param less: list of 'less' words
        :type less: list

        :param more: list of 'more' words
        :type more: list

        :returns: transformed word
        :rtype: str
        """

        new_word = self._get_similar_word(word, less, more)
        new_pos = en.tag(new_word)[0][1]

        if (pos[:2] != new_pos[:2]) or word == new_word:
            return word

        # handle noun
        if pos.startswith('NN'):

            # pluralization
            if pos.endswith('S') and not new_pos.endswith('S'):
                new_word = en.pluralize(new_word)

            elif not pos.endswith('S') and new_pos.endswith('S'):
                new_word = en.singularize(new_word)

            # capitalization
            if word[0].isupper():
                new_word = new_word[0].upper() + new_word[1:]
            else:
                new_word = new_word.lower()

        # handle verb
        elif pos.startswith('VB'):

            tense, person, number = en.tenses(word)[0][:3]

            # conjugation
            conjugated = en.conjugate(new_word,
                                    tense=tense,
                                    person=person,
                                    number=number,
                                    parse=False)

            if conjugated is not None:
                new_word = conjugated

        # remove underscores for joint words
        new_word = new_word.replace('_', ' ')

        return new_word
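
The conjugation step above round-trips through en.tenses() and en.conjugate(); below is a small hedged illustration of that round trip (imported directly here; the outputs are indicative only, not asserted):

from pattern.en import tenses, conjugate

tense, person, number = tenses('ate')[0][:3]
print((tense, person, number))                                      # first analysis of 'ate'
print(conjugate('eat', tense=tense, person=person, number=number))  # re-conjugate 'eat' the same way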
Example #11
def parse_pos(source_filename, output_filename, pos):
    wordlist = open(source_filename).read().split()
    matched_words = [word for word in wordlist if tag(word)[0][1] == pos]

    with open(output_filename, "w") as fp:
        for w in matched_words:
            fp.write("%s\n" % w)
Example #12
def denver_lyrics(link):
    link = link.replace('..', 'http://www.azlyrics.com/')
    html = urlopen(link).read()
    soup = BeautifulSoup(html)
    lyrics = str(''.join(''.join([s.text for s in soup.findAll('div')]).split('lyrics')[1:]).split('\n\n\n\n\r\nif')[0].strip()).replace('LYRICS', "").replace('JOHN DENVER', '')
    lyrics = [x for x in lyrics.splitlines() if x]
    lyrics = ' '.join([str(x) for x in lyrics if x[0] not in ['[', ' ']])
    exclude = set(string.punctuation)
    lyrics = ''.join(ch.lower() for ch in lyrics if ch not in exclude)
    lyrics = lyrics.split()
    nounlist = []
    for word in lyrics:
        tagged = tag(word)
        if tagged[0][1] in ['NN', 'NNP'] and len(word) > 2 and tagged not in nounlist:
            nounlist.append(tagged)
    return nounlist


#http://www.azlyrics.com/n/neildiamond.html
#http://www.azlyrics.com/j/johndenver.html
Example #13
	def tokenizer(self, url):
		#page = URL(url).download(user_agent='Mozilla/5')
		page = URL(url).download()
		text = plaintext(page, keep={})
		tokens = tag(text)
		#print tokens
		print len(tokens), ' words'
		document = []
		while tokens:
			document.append(tokens.pop(0)[0])
Example #14
 def process(wrd):
     tmp = ''
     ignore_pos = ['IN', 'RP', 'TO']
     exception_lemma = ['flatter', 'flattered']
     if tag(wrd)[0][1] in ignore_pos:
         tmp = wrd
     elif any(wrd in ex_l for ex_l in exception_lemma):
         tmp = wrd
     else:
         tmp = conjugate(wrd, tense=PAST)
     return tmp
Example #15
def run(o):

#	https://github.com/clips/pattern/blob/master/examples/03-en/03-parse.py

	import os, sys  # sys.path.insert(0, os.path.join("..", ".."))

	from pattern.en import parse, pprint, tag

	# The en module contains a fast regular expressions-based parser.
	# A parser identifies words in a sentence, word part-of-speech tags (e.g. noun, verb)
	# and groups of words that belong together (e.g. noun phrases).
	# Common part-of-speech tags: NN (noun), VB (verb), JJ (adjective), PP (preposition).
	# A tag can have a suffix, for example NNS (plural noun) or VBG (gerund verb).
	# Overview of tags: http://www.clips.ua.ac.be/pages/mbsp-tags
	s = "I eat pizza with a fork. one more test 1 Africa James Bob England Surrey Essex"
	s = parse(s,
	     tokenize = True,  # Tokenize the input, i.e. split punctuation from words.
	         tags = True,  # Find part-of-speech tags.
	       chunks = True,  # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
	    relations = True,  # Find relations between chunks.
	      lemmata = True,  # Find word lemmata.
	        light = False)

	# The light parameter determines how unknown words are handled.
	# By default, unknown words are tagged NN and then improved with a set of rules.
	# light=False uses Brill's lexical and contextual rules,
	# light=True uses a set of custom rules that is less accurate but faster (5x-10x).

	# The output is a string with each sentence on a new line.
	# Words in a sentence have been annotated with tags,
	# for example: fork/NN/I-NP/I-PNP
	# NN = noun, NP = part of a noun phrase, PNP = part of a prepositional phrase.
	print s
	print

	# Prettier output can be obtained with the pprint() command:
	pprint(s)
	print

	# The string's split() method will (unless a split character is given),
	# split into a list of sentences, where each sentence is a list of words
	# and each word is a list with the word + its tags.
	print s.split()
	print 

	# The tag() command returns a list of (word, POS-tag)-tuples.
	# With light=True, this is the fastest and simplest way to get an idea 
	# of a sentence's constituents:
	s = "I eat pizza with a fork. one more test 1 Africa James Bob England Surrey Essex"
	s = tag(s)
	print s
	for word, tag in s:
	    if tag == "NN": # Find all nouns in the input string.
	        print word
Example #16
def extract_pos_feat(text):
    pos_feat = []
    
    for article in text:
        pos_feat.append('~~~\n')
        for sent in article:
            token_tags = tag(sent)
            tags = map(lambda x: x[1], token_tags)
            pos_sent = START + ' '.join(tags) + END
            pos_feat.append(pos_sent)

    return pos_feat
Example #17
def clean_text_by_word(text):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return dict((unit.text, unit) for unit in units)
Example #18
def clean_text_by_word(text, language="english", deacc=False):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    init_textcleanner(language)
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, lowercase=True, deacc=deacc))
    filtered_words = filter_words(original_words)
    if HAS_PATTERN:
        tags = tag(" ".join(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return { unit.text : unit for unit in units }
Example #19
    def pluralify(self, bottish, english, dictionary):
        plural_suffix = "ly"

        for definition in dictionary:
            # if English word is a plural
            if tag(english)[0][1] in ("NNS", "NNPS"):
                # if the singular English word is already defined...
                if singularize(english) == definition["english"]:
                    bottish = definition["bottish"] + plural_suffix
                    return bottish
                # otherwise generate a new plural
                else:
                    bottish = bottish + plural_suffix
                    return bottish

            # if English word is a singular...
            elif tag(english)[0][1] in ("NN", "NNP"):
                # if a plural version is already defined...
                if pluralize(english) == definition["english"]:
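                    # note: str.rstrip() strips any trailing "l"/"y" characters, not only the exact "ly" suffix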
                    bottish = definition["bottish"].rstrip(plural_suffix)
                    return bottish

        return bottish
Example #20
 def tagging(self, sentence):
     filename = "nouns.txt"
     filename1 = "adjectives.txt"
     tags = tag(sentence)
     #POS tagging to get the required words corresponding to the below Noun and Adjective Tags
     tags_n = [word for word, pos in tags if pos in ['NN','NNS','NNP','NNPS']]
     tags_adj = [word for word, pos in tags if pos in ['JJ','JJS','JJR']]
     #Dumping noun and adjective tags respectively
     with open(filename, 'a') as fp:
         pickle.dump(tags_n, fp)
     with open(filename1, 'a') as fp:
         pickle.dump(tags_adj, fp)
Example #21
	def tagLemma(self, word_old):
		#print tag(word_old)
		for word, pos in tag(word_old):
			if pos == "NNS":  # plural nouns -> singular
				x = singularize(word)
			elif pos in ["VB", "VBG", "VBZ", "VBP", "VBD", "VBN", "MD"]:  # verbs -> infinitive
				x = conjugate(word, INFINITIVE)
				if not x:  # conjugate() occasionally fails and returns None
					x = word
			else:
				x = word
		return x
Example #22
def sentiment(content):
    relevant_types = ["JJ", "VB", "RB"]  # adjectives, verbs, adverbs
    score = 0
    wordnet.sentiment.load()
    synsets = wordnet.synsets
    for word, pos in tag(content):
        if pos in relevant_types:
            try:
                synset = synsets(word, pos)[0].weight
            except KeyError:
                # incorrect part of speech tag
                continue
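            # synset.weight is a (positivity, negativity, objectivity) triple (cf. Example #28 below)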
            ps, ns, os = synset
            score = score + (ps - ns)
    return 1 if score >= 0 else -1
Example #23
	def get(self, descriptions, mode = "link", index = False, field = False, count = False):
		""" Returns a list of names or connect a list of item to lexicon items
		
		Keyword arguments:
		descriptions	---	EHRI.get() descriptions list
		mode	---	Either link or search
		index	---	Overide self.index
		field ---	Field to query, default is scopeAndContent
		count	---	Overide self.count
		
		"""
		
		if index:
			self.index = index
		if field:
			self.field = field
		if count:
			self.count = count
		
		results = {}
		
			
		#Looping on items
		for description in descriptions:
			for word, pos in tag(description[self.field]):
				
				if pos in ["NN", "NNS", "NNP"]:
					w = singularize(word)
					w = w.lower()
					if w not in results:
						results[w] = 0
					results[w] += 1
					
					# If we are looking for stats about items
					if mode == "link":
						if w.title() in self.available:
							for item in self.lexicon:
								print item
								if w.title() in self.lexicon[item]:
									self.count[item] += 1
									if description[self.identifier] not in self.index:
										self.index[description[self.identifier]] = []
									self.index[description[self.identifier]].append(item)
		if mode == "link":
			return self.index
		elif mode == "search":
			return results
Example #24
def postag_feature_builder(
        text, target_pos=('JJ', 'NN', 'VB', 'NP', 'RB', 'CD')):
    """
        faster version of the tag feature builder
        uses paten.tag instead of paten.parsetree
    """
    if not text:
        return {}
    # tag each word
    try:
        result = patvec.count(
            (word for word,
             tag in paten.tag(text,
                              tokenize=True,
                              encoding='utf-8') if tag in target_pos))
    except IndexError as e:
        print text, e
        result = {}
    return result
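
A hedged usage sketch for the function above; the module aliases below are assumptions (the original file presumably defines them at the top), not part of the snippet:

import pattern.en as paten       # assumed alias
import pattern.vector as patvec  # assumed alias

feats = postag_feature_builder("The quick brown fox jumps over the lazy dog")
print(feats)  # counts of the words whose POS tag is in target_pos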
Example #25
def clean_text_by_word(text, language="english"):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    init_textcleanner(language)

    text_without_acronyms = [ replace_with_separator(text[i].text, "", [AB_ACRONYM_LETTERS]) for i in range(len(text)) ]
    original_sentences = [list(tokenize(text_without_acronyms[i], to_lower=True, deacc=True)) for i in range(len(text_without_acronyms))]
    # original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True))
    original_words = []
    for i, basicSentence in enumerate(original_sentences):
        text[i].basic = u' '.join(basicSentence)
        original_words += basicSentence
    filtered_words = filter_words(original_words)
    if HAS_PATTERN:
        tags = tag(" ".join(original_words)) # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return { unit.text : unit for unit in units }
Example #26
	def stemming(self, tokens):
		text = " ".join(tokens)
		words = []
		part_of_speech = {}
		part_of_speech['noun'] = ["NN"]
		part_of_speech['verbs'] = ["VB","VBG","VBP","VBZ","VBN","VBD"]
		part_of_speech['plural'] = ["NNS"]
		part_of_speech['adjective'] = ["JJ"]
		for word, pos in tag(text):
			if pos in part_of_speech['noun']:
				word = self.stemmer.stemming(word)
			if pos in part_of_speech["verbs"]:			
				word = lemma(word)
			if pos in part_of_speech['plural']:
				word = singularize(word)
			if pos in part_of_speech['adjective']:
				word = self.stemmer.stemming(word)
			words.append(word)
		return words
Example #27
def pos_tag_text(text):
    
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None
    
    tagged_text = tag(text)
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in
                         tagged_text]
    return tagged_lower_text
Example #28
def sentiment(content):
    
    if len(wordnet.sentiment) == 0:
        wordnet.sentiment.load()
        
    relevant_types = ['JJ', 'VB', 'RB'] #adjectives, verbs, adverbs
    score = 0
    
    synsets = wordnet.synsets
    for word, pos in tag(content):
        if pos in relevant_types:
            try:
                synset = synsets(word, pos)[0].weight
            except KeyError:
                #incorrect part of speech tag
                continue
            positivity, negativity, objectivity = synset
            score = score + (positivity - negativity) * (1 - objectivity)
                
    return score
Example #29
    def pos_tag_sentence(self, sentence, verb='^'):
        half_window = (self.window / 2)

        sentence = sentence.decode('utf8', 'replace')
        sentence_pos = pattern.tag(sentence)
        sentence_pos = [e for e in sentence_pos if e[1] in self.pos_tags]
        verb_pos = [v_p for v_p, e in enumerate(sentence_pos) if e[0] == verb][0]
        start_pos, end_pos = verb_pos - half_window, verb_pos + half_window

        start_pos = 0 if start_pos < 0 else start_pos
        word_context = sentence_pos[start_pos: end_pos]

        # pad the first element to form equal length vectors
        if len(word_context) < self.window:
            window_append = [word_context[0]] * (self.window - len(word_context))
            window_append.extend(word_context)
            word_context = window_append

        word_context.pop(half_window)
        return word_context
Example #30
def clean(doc):
    st = ""
    sciencestopwords = set([u'model','according', 'data', u'models', 'function', 'properties', 'approach', 'parameters', 
                'systems', 'number', 'order', u'data', 'analysis', u'information', u'journal',
                'results','using','research', 'consumers', 'scientists', 'model', 'models', 'journal',
                'researchers','paper','new','study','time','case', 'simulation', u'simulation', 'equation',
                'based','years','better', 'theory', 'particular','many','due','much','set', 'studies', 'systems',
                'simple', 'example','work','non','experiments', 'large', 'small', 'experiment', u'experiments',
                'provide', 'analysis', 'problem', 'method', 'used', 'methods'])
    for (word, pos) in tag(doc):
        if pos in ["JJ", "NNS", "NN", "NNP"]:
            st = st+word+" "
        else:
            if st!= "":
                st = st[0:-1]+" "
                #print "got one"
    wordl = st.lower().split()
    s = ""
    for word in wordl:
        if word not in sciencestopwords:
            s = s+" "+word
    return s
Example #31
    def restructCaption(self, pred_caption, subject_set, object_set,
                        pred_relations):
        total_rel = [0 for i in range(len(subject_set.keys()))]

        skey = list(subject_set.keys())

        for_tagging_caption = pred_caption.lower()
        splitted_caption = for_tagging_caption.split()
        next_tag_of_subject_set = {}
        verb_noun_tag = ["VB", "VBP", "VBZ", "VBG", "VBD", "VBN", "NN"]
        person_tag = ["woman", "man", "people"]
        for subject in skey:
            for i in range(len(splitted_caption)):
                if subject == splitted_caption[i] or (subject == "person"
                                                      and splitted_caption[i]
                                                      in person_tag):
                    next_word = i + 1

                    if next_word < len(splitted_caption):
                        if Logging:
                            print("[restructCaption] next_word is '%s'" %
                                  (splitted_caption[next_word]))

                        for word, pos in tag(splitted_caption[next_word]):
                            if Logging:
                                print(
                                    "[restructCaption] the pos of next word is '%s'"
                                    % (pos))

                            if pos in verb_noun_tag:
                                next_tag_of_subject_set[subject] = True
                                if Logging:
                                    print(
                                        "[restructCaption] next_tag(VERB OR NOUN?) is appended: '%s'"
                                        % (subject + ' : ' +
                                           splitted_caption[next_word]))
                            else:
                                next_tag_of_subject_set[subject] = False

                        break

                    else:
                        next_tag_of_subject_set[subject] = False
                else:
                    next_tag_of_subject_set[subject] = False

        for i in range(len(skey)):
            if Logging:
                print("[restructCaption] the subject key is '%s'" % (skey[i]))

            for j in range(len(subject_set[skey[i]])):
                if Logging:
                    print("[restructCaption] next_tag is '%s'" %
                          (next_tag_of_subject_set[skey[i]]))

                if j == 0:
                    # total_rel[i] = pred_relations[subject_set[skey[i]][j]][0] + ' ' + pred_relations[subject_set[skey[i]][j]][1] + ' ' + pred_relations[subject_set[skey[i]][j]][2]
                    total_rel[i] = pred_relations[subject_set[skey[i]][j]][
                        1] + ' ' + pred_relations[subject_set[skey[i]][j]][2]

                    if len(subject_set[skey[i]]
                           ) == 1 and next_tag_of_subject_set[skey[i]]:
                        total_rel[i] = total_rel[i] + ' and'
                    continue

                total_rel[i] = total_rel[i] + ' and ' + pred_relations[
                    subject_set[skey[i]][j]][1] + ' ' + pred_relations[
                        subject_set[skey[i]][j]][2]
                if next_tag_of_subject_set[skey[i]]:
                    total_rel[i] = total_rel[i] + ' and'

        if Logging:
            print("[restructCaption] total phrase is '%s'" % (total_rel))

        # Insert part
        final_caption = pred_caption
        splitted_final_caption = final_caption.split()
        pop_relations_index = []
        for i in range(len(skey)):
            # final_caption = final_caption.replace(skey[i], total_rel[i])
            if skey[i] == "person":
                if "man" in splitted_final_caption:
                    final_caption = final_caption.replace(
                        "man", "man" + ' ' + total_rel[i])
                elif "woman" in splitted_final_caption:
                    final_caption = final_caption.replace(
                        "woman", "woman" + ' ' + total_rel[i])
                elif "people" in splitted_final_caption:
                    final_caption = final_caption.replace(
                        "people", "people" + ' ' + total_rel[i])
                else:
                    index_list = subject_set[skey[i]]
                    # print(index_list)
                    for j in range(len(index_list)):
                        pop_relations_index.append(index_list[j])
            else:
                if skey[i] in splitted_final_caption:
                    final_caption = final_caption.replace(
                        skey[i], skey[i] + ' ' + total_rel[i])
                else:
                    index_list = subject_set[skey[i]]
                    # print(index_list)
                    for j in range(len(index_list)):
                        pop_relations_index.append(index_list[j])
                    # print(pop_relations_index)

        used_relations = []
        for i in range(len(pred_relations)):
            if i in pop_relations_index:
                continue
            else:
                used_relations.append(pred_relations[i])

        if Logging:
            print("[restructCaption] final caption is '%s'" % (final_caption))
            print("[restructCaption] final used relations are '%s'" %
                  (used_relations))

        # print(used_relations)

        return final_caption, used_relations
Example #32
    if not inputfile.endswith('/'):
        inputfile += '/'
    srts = [
        inputfile + f for f in os.listdir(inputfile)
        if f.lower().endswith('srt')
    ]

for srt in srts:
    f = open(srt, 'r')
    for line in f:
        if line.find('-->') == -1:
            text += line
    f.close()

text = re.sub(r'^\d+[\n\r]', '', text, flags=re.MULTILINE)
tags = tag(text)
pos = [t[1] for t in tags]
ngrams = {}
n = int(sys.argv[2])

for i in range(len(pos) - n + 1):
    gram = tuple(pos[i:i + n])
    if gram in ngrams:
        ngrams[gram] += 1
    else:
        ngrams[gram] = 1

for ngram in sorted(ngrams, key=ngrams.get, reverse=True):
    count = ngrams[ngram]
    if count > 4:
        print ' '.join(ngram) + ": " + str(count)
Example #33
 def tag(self, tokens):
     # don't import at top since don't want to fail if not installed
     from pattern.en import tag
     # not tokenizing ensures that the number of tagged tokens returned is
     # the same as the number of input tokens
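     # e.g. tagging the tokens ['black', 'cats'] should yield [('black', 'JJ'), ('cats', 'NNS')]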
     return tag(u' '.join(tokens), tokenize=False)
Example #34
        f = open(path + genreFolder + "/" + files, "r")
        plotText = f.read()
        f.close()

        plotText = re.sub('<!--.*?>.*?-->', '', plotText, 0, re.I | re.S)
        plotText = re.sub('<.+?>', '', plotText, 0, re.I | re.S)

        sentList = [
            x.replace("\n", " ")
            for x in nltk.sent_tokenize(plotText.replace("\t", ""))
        ]

        for strSentence in sentList:
            #print(strSentence)
            for word, pos in tag(strSentence):
                if pos in ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"):
                    word = str(lemma(word))
                    if (word not in ("be", "do", "let", "begin", "have", "try",
                                     "start")):
                        verbList.append(word)

            a = parse(strSentence, relations=True, lemmata=True)
            #pprint(a)

            sentence = Sentence(a)
            for i in range(0, len(sentence.verbs) - 1):
                strVP = str(' '.join(sentence.verbs[i].lemmata))
                vpList.append(strVP)

            #print(sentence.relations)
Example #35
# By default, unknown words are tagged NN and then improved with a set of rules.
# light=False uses Brill's lexical and contextual rules,
# light=True uses a set of custom rules that is less accurate but faster (5x-10x).

# The output is a string with each sentence on a new line.
# Words in a sentence have been annotated with tags,
# for example: fork/NN/I-NP/I-PNP
# NN = noun, NP = part of a noun phrase, PNP = part of a prepositional phrase.
print s
print

# Prettier output can be obtained with the pprint() command:
pprint(s)
print

# The string's split() method will (unless a split character is given),
# split into a list of sentences, where each sentence is a list of words
# and each word is a list with the word + its tags.
print s.split()
print

# The tag() command returns a list of (word, POS-tag)-tuples.
# With light=True, this is the fastest and simplest way to get an idea
# of a sentence's constituents:
s = "I eat pizza with a fork."
s = tag(s, light=True)
print s
for word, tag in s:
    if tag == "NN":  # Find all nouns in the input string.
        print word
Example #36
	(first iteration)
	1st line: contain 9 syllables
	2nd line: contain 8 syllables 
	3rd line: contain 7 syllables 
	...
	9th line: contain 1 syllable

"""

from pattern.en import parsetree
from pattern.en import tag
from pattern.en import pprint

def word_eval(string):
	pprint(parsetree(string, relations = True))
	for word, pos in tag(string):
		if pos == "NN":
			print word

def gutenberg_text_gather(current_URL):

	from pattern.web import *
	buddhist_psalm_text = URL(current_URL).download()
	print buddhist_psalm_text

	# Save data to a file (will be part of your data fetching script)
	f = open('buddhist_psalm_text.pickle','w')
	pickle.dump(all_texts,f)
	f.close()

	# Load data from a file (will be part of your data processing script)
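
The docstring above describes a 9-8-7-...-1 syllable structure per line; below is a minimal sketch (not part of the original script) of a naive vowel-group syllable counter that could be used to check each line. Real syllable counting would need a pronunciation dictionary such as CMUdict.

import re

def count_syllables(word):
	# approximate: each run of consecutive vowels counts as one syllable
	return max(1, len(re.findall(r"[aeiouy]+", word.lower())))

def line_syllables(line):
	return sum(count_syllables(w) for w in re.findall(r"[A-Za-z']+", line))

print(line_syllables("calm waters mirror the old stone bridge"))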
Example #37
visualize_sentence_tree(t)

from nltk.corpus import treebank_chunk

data = treebank_chunk.chunked_sents()
train_data = data[:4000]
test_data = data[4000:]
print train_data[7]

simple_sentence = 'the quick fox jumped over the lazy dog'

from nltk.chunk import RegexpParser
from pattern.en import tag

tagged_simple_sent = tag(simple_sentence)
print tagged_simple_sent

chunk_grammar = """
NP: {<DT>?<JJ>*<NN.*>}
"""
rc = RegexpParser(chunk_grammar)
c = rc.parse(tagged_simple_sent)
print c

chink_grammar = """
NP: {<.*>+} # chunk everything as NP
}<VBD|IN>+{
"""
Example #38
# By default, unknown words are tagged NN and then improved with a set of rules.
# light=False uses Brill's lexical and contextual rules,
# light=True uses a set of custom rules that is less accurate but faster (5x-10x).

# The output is a string with each sentence on a new line.
# Words in a sentence have been annotated with tags,
# for example: fork/NN/I-NP/I-PNP
# NN = noun, NP = part of a noun phrase, PNP = part of a prepositional phrase.
print s
print

# Prettier output can be obtained with the pprint() command:
pprint(s)
print

# The string's split() method will (unless a split character is given),
# split into a list of sentences, where each sentence is a list of words
# and each word is a list with the word + its tags.
print s.split()
print

# The tag() command returns a list of (word, POS-tag)-tuples.
# With light=True, this is the fastest and simplest way to get an idea
# of a sentence's constituents:
s = "I eat pizza with a fork."
s = tag(s)
print s
for word, tag in s:
    if tag == "NN":  # Find all nouns in the input string.
        print word
Example #39
print lemma('running')
print conjugate('purred', '3sg')
print PAST in tenses('purred')  # 'p' in tenses() also works.
print(PAST, 1, PL) in tenses('purred')

print 'Quantification'

print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify('carrot', amount=90)
print quantify({'carrot': 100, 'parrot': 20})

print 'ngrams'
print ngrams("I am eating a pizza.", n=2)

#parse
s = parse('I eat pizza with a fork.')
pprint(s)

#tag
for word, t in tag('The cat felt happy.'):
    print word + ' is ' + t

s = "The movie attempts to be surreal by incorporating various time paradoxes, but it's presented in such a ridiculous way it's seriously boring."
print sentiment(s)
print polarity(s)
print subjectivity(s)

#The modality() function returns a value between -1.0 and +1.0, expressing the degree of certainty
s2 = "Some amino acids tend to be acidic while others may be basic."  # weaseling
se = Sentence(parse(s2, chunks=False, lemmata=True))
print modality(se)
Example #40
from pattern.web import Twitter
from pattern.en import tag
from pattern.vector import KNN, count

twitter, knn = Twitter(), KNN()

for i in range(1, 3):
    for tweet in twitter.search('#win OR #fail', start=i, count=100):
        s = tweet.text.lower()
        p = '#win' in s and 'WIN' or 'FAIL'
        v = tag(s)
        v = [word for word, pos in v if pos == 'JJ']  # JJ = adjective
        v = count(v)  # {'sweet': 1}
        if v:
            knn.train(v, type=p)

print(knn.classify('sweet potato burger'))
print(knn.classify('stupid autocorrect'))
Example #41
                count += 1
            except:
                nevermind = 1
    writer.close()
    print "[+] saved to files. Saved", count, "words."


# ----------------------------

rawjson = open("archetypes_settings_corpora.json").read()  #read the file as one big string into rawjson
data = json.loads(rawjson)  #json.loads takes a string and turns it into a data structure
for elem in data["settings"]:
    w = elem["name"]
    pos = tag(w)[-1][1]
    # print "-"*20
    # print w, pos
    add_word(w, pos)

    if pos.startswith("VB") and Word(w).lemmatize('v') is not w:
        w = Word(w).lemmatize('v')
        pos = tag("to " + w)[-1][1]
        # print "-"*5
        # print w, pos
        add_word(w, pos)
    if pos.startswith("NN") and Word(w).lemmatize('n') is not w:
        w = Word(w).lemmatize('n')
        pos = tag(w)[-1][1]
        # print "-"*5
        # print w, pos