Example #1
File: faq.py Project: VRDate/twss
def process(statement, database_name=DATABASE_NAME):
    ''' Allows us to create entities via statements like "There is a course CSCI4702 called Mobile Programming"
      and modify entities with statements like "CSCI4702 has a start date of Jan 31st 2013"

      A statement like "There is a game engine Unity3d" already gives us trouble:
      it seems we need named entity recognition to extract types like that, or perhaps to rely on
      capitalization, which doesn't really work for things like CTO as a category of items.

      >>> sent = "There is a game engine Unreal Engine".split()
      >>> print(nltk.ne_chunk(nltk.pos_tag(sent)))
      '''
    # This runs fast, but it doesn't quite get the NN/NNP combination hoped for from "There is a game engine Unity3D".
    # It does with light=True, but then it misses the NNP in "There is a game engine Source".

    s = parse(statement, relations=True, lemmata=True, light=True)
    s = split(s)

    #result = search('There be DT NN+ (DT) (RB) (JJ) NNP+ (call) (DT) (RB) (JJ) (NNPS|NNP)+', s)
    s, result = extract(statement)
    if result:
        noun = search('(NN)+', s)[0].string
        table = pluralize(noun.replace(' ', '_'))
        # This pulls in adjectives too; a better fix is supposedly on the way.
        result = search('(JJ|NNPS|NNP)+', s)
        ident = result[0].string
        name = result[1].string if len(result) > 1 else ident
        return newTable(table, ident, name, database_name)
    else:
        return regexMatch(statement, database_name)
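
For context, a hypothetical usage sketch (process() lives in faq.py of the twss project, which supplies DATABASE_NAME, newTable() and regexMatch(); pattern.en and pattern.search provide the NLP calls):

# Hypothetical usage of process(); behavior inferred from the docstring above.
# Should create a "courses" table with ident "CSCI4702" and name "Mobile Programming".
process("There is a course CSCI4702 called Mobile Programming")
# Statements that don't match the POS pattern fall back to regexMatch().
process("CSCI4702 has a start date of Jan 31st 2013")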
Example #2
def sentiment(content):
    from pattern.en import parse, split, wordnet  # requires the SentiWordNet data
    # Lazily load the SentiWordNet data on first use.
    if len(wordnet.sentiment) == 0:
        wordnet.sentiment.load()

    relevant_types = ['JJ', 'VB', 'RB']  # adjectives, verbs, adverbs
    score = 0
    sentences = split(parse(content))
    for sentence in sentences:
        for index, word in enumerate(sentence.words):
            if word.string != '' and word.type in relevant_types:
                try:
                    synsets = wordnet.synsets(word.string, word.type)
                    pos, neg, obj = synsets[0].weight
                except (KeyError, IndexError):
                    # Incorrect part-of-speech tag or not in WordNet; skip it.
                    continue

                # Weight words near the end of the sentence more heavily
                # (idea from [Ohana, Tierney '09]).
                documentpos = index / float(len(sentence.words))

                # Weight more subjective words more heavily.
                subjscore = (pos - neg) * (1 - obj)

                score = score + subjscore * documentpos
    return score
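
To make the position weighting concrete, here is the arithmetic for a single word (the SentiWordNet weights are assumed values for illustration):

# Worked example of the position-weighted subjectivity score above.
pos, neg, obj = 0.625, 0.0, 0.375  # assumed weights for a clearly positive adjective
index, sentence_len = 8, 10        # a word near the end of a 10-word sentence

documentpos = index / float(sentence_len)  # 0.8 -> later words count more
subjscore = (pos - neg) * (1 - obj)        # 0.390625 -> subjectivity-weighted polarity
print(subjscore * documentpos)             # 0.3125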
Example #3
def sentiment(content):
    from pattern.en import parse, split, wordnet  # requires the SentiWordNet data
    wordnet.sentiment.load()
    relevant_types = ['JJ', 'VB', 'RB']  # adjectives, verbs, adverbs
    score = 0
    sentences = split(parse(content, lemmata=True))
    for sentence in sentences:
        for word in sentence.words:
            # Skip lemmas missing from the sentiment lexicon (avoids a KeyError).
            if word.type in relevant_types and word.lemma in wordnet.sentiment:
                pos, neg, obj = wordnet.sentiment[word.lemma]
                score = score + ((pos - neg) * (1 - obj))  # weight subjective words
    return score
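
This variant looks sentiment up by lemma, so inflected forms share a single lexicon entry; a quick check with pattern.en's lemmatizer illustrates why lemmata=True is passed to parse():

# 'was' and 'were' both lemmatize to 'be', so they share one sentiment entry.
from pattern.en import lemma
print(lemma('was'), lemma('were'))  # -> be be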
Example #4
def sentiment(content):
    from pattern.en import parse, split, wordnet  # must have SentiWordNet available
    wordnet.sentiment.load()
    relevant_types = ['JJ', 'VB', 'RB']  # adjectives, verbs, adverbs
    score = 0
    sentences = split(parse(content, lemmata=True))
    for sentence in sentences:
        for word in sentence.words:
            # Skip lemmas missing from the sentiment lexicon (avoids a KeyError).
            if word.type in relevant_types and word.lemma in wordnet.sentiment:
                pos, neg, obj = wordnet.sentiment[word.lemma]
                score = score + ((pos - neg) * (1 - obj))  # weight subjective words heavily
    return 1 if score >= 0 else -1
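
Collapsing the score to its sign turns the function into a coarse binary classifier; a hedged usage sketch (expected outputs depend on the SentiWordNet data installed):

print(sentiment("The movie was a beautiful, moving experience."))   # likely 1
print(sentiment("The plot was dull and the acting was terrible."))  # likely -1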
Example #5
def sentiment(content):
    from pattern.en import parse, split, wordnet
    wordnet.sentiment.load()
    relevant_types = [
        'JJ',
        'VB',
        'VBD',
        'VBN',
        'VBG',
        'RB',
    ]
    score = 0
    sentences = split(parse(content, lemmata=True))
    for sentence in sentences:
        for word in sentence.words:
            # Skip lemmas missing from the sentiment lexicon (avoids a KeyError).
            if word.type in relevant_types and word.lemma in wordnet.sentiment:
                pos, neg, obj = wordnet.sentiment[word.lemma]
                score = score + ((pos - neg) * (1 - obj))
    #return 1 if score >= 0 else -1
    return score
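
A note on the tag list above: a missing comma between two entries would not raise an error, because Python's implicit string-literal concatenation silently fuses adjacent literals into one bogus tag:

# Implicit string concatenation: adjacent literals fuse into one element.
tags = ['VBG' 'RB']             # == ['VBGRB'] -- one element, not two
assert tags == ['VBGRB']
assert ['VBG', 'RB'] != tags    # the comma is what keeps the tags separate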
Example #6
    def tokenize(self, text):
        """
        Tokenize words in a text and return the relevant ones

        Parameters
        ----------
        text : str
            Text to tokenize.
        """
        for f in self.filters:
            text = f(text)

        words = []
        for s in nlp.split(nlp.parse(text)):
            for word, tag in s.tagged:
                if tag in self.nlp_tags:
                    word = word.lower()
                    if word not in self.exclude_words:
                        words.append(word)

        return words
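
A minimal self-contained sketch of how tokenize() is typically wired up, assuming pattern.en is the nlp module referenced above; the class name and constructor defaults are hypothetical, since the enclosing class is not shown in the excerpt:

import pattern.en as nlp

class KeywordTokenizer(object):
    # Hypothetical host class for the tokenize() method above.
    def __init__(self, filters=(), nlp_tags=('NN', 'NNS', 'JJ'), exclude_words=()):
        self.filters = list(filters)          # text -> text preprocessing callables
        self.nlp_tags = set(nlp_tags)         # POS tags worth keeping
        self.exclude_words = set(exclude_words)

    def tokenize(self, text):
        for f in self.filters:
            text = f(text)
        words = []
        for s in nlp.split(nlp.parse(text)):
            for word, tag in s.tagged:
                if tag in self.nlp_tags:
                    word = word.lower()
                    if word not in self.exclude_words:
                        words.append(word)
        return words

print(KeywordTokenizer(exclude_words={'lazy'}).tokenize(
    "The quick brown fox jumps over the lazy dog."))
# likely -> ['quick', 'brown', 'fox', 'dog']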
Example #7
    def test_split(self):
        # Assert split(parse(s)) == Text.
        v = en.split(en.parse("The cat purs."))
        self.assertTrue(isinstance(v, en.Text))
        print("pattern.en.split()")
Example #8
spacy_pos = pd.concat(
    [
        labeled_pos,
        pd.DataFrame({"Spacy_pos_pred": tokens_pos, "Spacy_pos_full_pred": tokens_pos_full, "Spacy_tag": tokens_tag}),
    ],
    axis=1,
)

# %%

# Pattern

s = parse(last_5_sent_full_clean)
s = split(s)

pattern_pos = []

for sentence in s.sentences:
    pattern_pos.extend(list(sentence.pos))

if len(pattern_pos) != labeled_pos.shape[0]:
    print("inconsistency between pattern pos and labeled pos")

pattern_pos = pd.concat(
    [labeled_pos, pd.DataFrame({"Pattern_pos_pred": pattern_pos})], axis=1
)
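
With predictions aligned row by row against the gold labels, a per-tagger agreement rate is one line each; the gold column name "Labeled_pos" below is an assumption, since the schema of labeled_pos is not shown:

# Hypothetical follow-up: agreement of each tagger with the gold POS column.
gold = pattern_pos["Labeled_pos"]  # assumed column name in labeled_pos
print("pattern agreement:", (pattern_pos["Pattern_pos_pred"] == gold).mean())
print("spaCy agreement:", (spacy_pos["Spacy_pos_pred"] == gold).mean())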
Example #9
def main():

    # The first two vars count relevant sentences; the other two hold weighted score values
    police_killer_i = 0
    police_killed_i = 0
    police_killer_value = 0.0
    police_killed_value = 0.0
    total_sentences = 0

    # Init Twitter query engine
    engine = Twitter(license=None, language='en')
    results_list = []
    print('Performing twitter queries...')

    # 4 different queries with 100 results each = 400 results
    results_list.append(
        engine.search('policeman kill', start=1, count=100, cached=False))
    results_list.append(
        engine.search('policeman killed', start=1, count=100, cached=False))
    results_list.append(
        engine.search('police kill', start=1, count=100, cached=False))
    results_list.append(
        engine.search('police killed', start=1, count=100, cached=False))

    #print lemma('shot')

    # Open a file to put some recognized examples
    examples_file = open('examples.txt', 'w', encoding='utf-8')

    # For each list of results
    for ii in range(len(results_list)):
        print('Starting to analyze query results: ' + str(ii + 1) +
              ' out of ' + str(len(results_list)))
        for res in results_list[ii]:
            # Parse and split the tweet into sentences
            s = parse(res.description.lower(),
                      chunks=True,
                      relations=True,
                      lemmata=True)
            #s = parse(string.lower(res), chunks=True, relations=True, lemmata=True)
            #pprint(s)

            ss = split(s)

            # Then for each sentence
            for sent in ss:
                # Update sentences number
                total_sentences += 1

                # Check the reliability of the sentence:
                # 0.5 if a profanity is found, 1.0 otherwise.
                found = any(word.string in PROFANITY
                            for word in sent.words)
                value = 0.5 if found else 1.0

                #print sent.chunks
                # Clear the sentence of PNP elements: filter out the words belonging to a PNP.
                cleared_sentence_words = [w for w in sent.words
                                          if w.pnp is None]
                cleared_string = ''

                # There seems to be no way to reconstruct a parsed sentence other than
                # assembling a string again and re-parsing it.
                for word in cleared_sentence_words:
                    cleared_string += ' ' + word.string
                #print cleared_string
                cleared_sentence = parse(cleared_string,
                                         chunks=True,
                                         relations=True,
                                         lemmata=True)
                cleared_sentence = split(cleared_sentence)
                #pprint(cleared_sentence)
                sentence_type1 = False

                # Now cleared sentence is a sentence without PNP
                # Check if it is a standard active sentence
                for match in search('NP kill NP', cleared_sentence):
                    # It is
                    sentence_type1 = True
                    # Check if the Subject is the police
                    if (match.constituents()[0].role == 'SBJ'):
                        for word in match.constituents()[0].words:
                            if word.string in search_list:
                                police_killer_i += 1
                                police_killer_value += value
                                #print('Police killed')
                                # Print to the examples' file the recognized match
                                for sword in match.words:
                                    examples_file.write(sword.string + ' ')
                                examples_file.write('\r\n')
                                #examples_file.write(str(match.words)+'\r\n');
                                examples_file.write(
                                    '   Recognized as: police killed somebody'
                                    + '\r\n')
                                examples_file.write(
                                    '   TYPE: ACTIVE - SUBJECT' + '\r\n')
                                examples_file.write('\r\n')

                    if (len(match.constituents()) > 2):
                        # Or check if it is object
                        if (match.constituents()[2].role == 'OBJ'):
                            for word in match.constituents()[2].words:
                                if word.string in search_list:
                                    police_killed_i += 1
                                    police_killed_value += value
                                    #print('Killed by police')
                                    # Print to the example file the recognized match
                                    for sword in match.words:
                                        examples_file.write(sword.string + ' ')
                                    examples_file.write('\r\n')
                                    examples_file.write(
                                        '   Recognized as: police killed by somebody'
                                        + '\r\n')
                                    examples_file.write(
                                        '   TYPE: ACTIVE - OBJECT' + '\r\n')
                                    examples_file.write('\r\n')

                # If it was not an active sentence, check if it is a passive one
                if (not sentence_type1):
                    #print('Try type 2')
                    for match in search('NP kill (PP)+ (NP)+',
                                        cleared_sentence):
                        # match.constituents() returns a mixed list that can contain both
                        # Chunks and Words; we care about chunk roles, so we need
                        # isinstance() checks.
                        # Check the subject.
                        if (isinstance(match.constituents()[0], Chunk)):
                            if (match.constituents()[0].role == 'SBJ'):
                                #print('Is subject')
                                for word in match.constituents()[0]:
                                    #for word in match.chunks()[0]:
                                    if word.string in search_list:
                                        police_killer_i += 1
                                        police_killer_value += value
                                        # Print to the example file the recognized match
                                        for sword in match.words:
                                            examples_file.write(sword.string + ' ')
                                        examples_file.write('\r\n')
                                        examples_file.write(
                                            '   Recognized as: police killed somebody'
                                            + '\r\n')
                                        examples_file.write(
                                            '   TYPE: PASSIVE - SUBJECT - CHUNK'
                                            + '\r\n')
                                        examples_file.write('\r\n')

                        elif (isinstance(match.constituents()[0], Word)):
                            if match.constituents()[0].string in search_list:
                                police_killer_i += 1
                                police_killer_value += value
                                #print('Killed by police')
                                # Print to the example file the recognized match
                                for sword in match.words:
                                    examples_file.write(sword.string + ' ')
                                examples_file.write('\r\n')
                                examples_file.write(
                                    '   Recognized as: police killed somebody'
                                    + '\r\n')
                                examples_file.write(
                                    '   TYPE: PASSIVE - SUBJECT - WORD' +
                                    '\r\n')
                                examples_file.write('\r\n')

                        # Check the object. First filter the Word objects out of the match
                        # results to see if there are enough Chunks.
                        if len([c for c in match.constituents()
                                if isinstance(c, Chunk)]) == 4:
                            if (match.constituents()[3].role == 'OBJ'):
                                for word in match.constituents()[3]:
                                    if word.string in search_list:
                                        police_killed_i += 1
                                        police_killed_value += value
                                        # Print to the example file the recognized match
                                        for sword in match.words:
                                            examples_file.write(sword.string + ' ')
                                        examples_file.write('\r\n')
                                        examples_file.write(
                                            '   Recognized as: police was killed by somebody'
                                            + '\r\n')
                                        examples_file.write(
                                            '   TYPE: PASSIVE - OBJECT - CHUNK'
                                            + '\r\n')
                                        examples_file.write('\r\n')
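
The excerpt ends before the tallies are reported. A hypothetical wrap-up for the end of main(), continuing the indentation above, might look like:

    # Hypothetical wrap-up: close the examples file and report the
    # weighted tallies accumulated above.
    examples_file.close()
    print('Total sentences analyzed: ' + str(total_sentences))
    print('Police as killer: %d sentences (weighted score %.1f)'
          % (police_killer_i, police_killer_value))
    print('Police as killed: %d sentences (weighted score %.1f)'
          % (police_killed_i, police_killed_value))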