import json
import urllib

def googleSearch():
    # AUTHOR: Kristin Funch
    # bagOfWords, printDebate, printWikipedia and printReddit are defined elsewhere in this project.
    searchTerm = raw_input('Query: ')

    # DEBATE: top debate.org/opinions hit for the query
    debateQuery = searchTerm + ' site:www.debate.org/opinions'
    query = urllib.urlencode({'q': debateQuery})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results3 = json1['responseData']['results']
    for x in range(0, 1):
        url = results3[x]['url']
        top2 = bagOfWords(url, 0, searchTerm)[3]
        print top2
        printDebate(url)

    # WIKI: top English Wikipedia hit for the query
    query = searchTerm + ' site:en.wikipedia.org'
    query = urllib.urlencode({'q': query})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results = json1['responseData']['results']
    for x in range(0, 1):
        url = results[x]['url']
        top2 = bagOfWords(url, 0, searchTerm)[3]
        print top2
        printWikipedia(url)

    # REDDIT: score every r/self hit and print the most personal story
    redditQuery = searchTerm + ' site:www.reddit.com/r/self'
    query = urllib.urlencode({'q': redditQuery})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results2 = json1['responseData']['results']
    personalStoryRatings = []
    for x in range(0, len(results2)):
        url = results2[x]['url']
        bagOfWordsResults = bagOfWords(url, 1, searchTerm)
        personalStoryRatings.append(bagOfWordsResults[1])
        top2 = bagOfWordsResults[3]
        print top2
    print personalStoryRatings
    printReddit(results2[personalStoryRatings.index(max(personalStoryRatings))]['url'])
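# A minimal refactoring sketch, not part of the original script: the
# urlencode -> urlopen -> json.loads -> responseData/results sequence above is
# repeated for every source, so it could be pulled into one helper. The name
# googleResults is an assumption, not the author's code; the endpoint is the
# same legacy Google AJAX Search API URL used above.
def googleResults(q):
    """Return the result list for query q from the legacy AJAX Search API."""
    params = urllib.urlencode({'q': q})
    raw = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + params).read()
    return json.loads(raw)['responseData']['results']

# Example use, mirroring the DEBATE block above:
# results3 = googleResults(searchTerm + ' site:www.debate.org/opinions')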
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# countHeadlineLength, countRealFake, exportTestTrain and bagOfWords are project helpers defined elsewhere.
countHeadlineLength(X_train, y_train)
countRealFake(y_train, y_test)

# Leave-one-out splitting, currently disabled
# (note: the last assignment probably intends y_train[test_index] rather than y_test[test_index]):
# loo = LeaveOneOut()
# print(loo.get_n_splits(X_train))
# print(loo)
# for train_index, test_index in loo.split(X_train.to_frame(), y_train.to_frame()):
#     print("Train:", train_index, "Test:", test_index)
#     X_train, X_test = X_train[train_index], X_train[test_index]
#     y_train, y_test = y_train[train_index], y_test[test_index]
#     print(X_train, X_test, y_train, y_test)

exportTestTrain(X_train, X_test, y_train, y_test)

# Term counts for the training headlines, then tf and tf-idf weightings.
count_vect, X_train_counts = bagOfWords(X_train, False, True, True)
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print(X_train_tf.shape)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Counts -> tf-idf -> multinomial naive Bayes as a single pipeline.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
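# A minimal sketch of fitting and scoring the pipeline defined above; this is
# not from the original script. It assumes X_train/X_test are iterables of
# headline strings and y_train/y_test are the matching real/fake labels.
from sklearn import metrics

text_clf.fit(X_train, y_train)        # vectorize, tf-idf weight, then fit MultinomialNB
predicted = text_clf.predict(X_test)  # predict labels for the held-out headlines
print(metrics.accuracy_score(y_test, predicted))
print(metrics.classification_report(y_test, predicted))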
import json
import urllib

def googleSearch():
    # AUTHOR: Kristin Funch
    # bagOfWords and bagOfWordsByParagraph are defined elsewhere in this project.
    print '\n\n'
    searchTerm = raw_input('Query: ')
    print '\n\n'
    topRelatedWords = []

    # DEBATE: top debate.org/opinions hit for the query
    debateQuery = searchTerm + ' site:www.debate.org/opinions'
    query = urllib.urlencode({'q': debateQuery})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results3 = json1['responseData']['results']
    for x in range(0, 1):
        url = results3[x]['url']
        top2 = bagOfWords(url, 0, searchTerm)[3]
        topRelatedWords.append(top2[0])
        topRelatedWords.append(top2[1])
        print bagOfWordsByParagraph(url)[0]
    print '\n\n'

    # WIKI: top English Wikipedia hit for the query
    query = searchTerm + ' site:en.wikipedia.org'
    query = urllib.urlencode({'q': query})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results = json1['responseData']['results']
    for x in range(0, 1):
        url = results[x]['url']
        top2 = bagOfWords(url, 0, searchTerm)[3]
        topRelatedWords.append(top2[0])
        topRelatedWords.append(top2[1])
        print bagOfWordsByParagraph(url)[0]
    print '\n\n'

    # REDDIT: score every r/self hit and print the most personal story
    redditQuery = searchTerm + ' site:www.reddit.com/r/self'
    query = urllib.urlencode({'q': redditQuery})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results2 = json1['responseData']['results']
    personalStoryRatings = []
    for x in range(0, len(results2)):
        url = results2[x]['url']
        bagOfWordsResults = bagOfWords(url, 1, searchTerm)
        personalStoryRatings.append(bagOfWordsResults[1])
        top2 = bagOfWordsResults[3]
        topRelatedWords.append(top2[0])
        topRelatedWords.append(top2[1])
    # print personalStoryRatings
    print bagOfWordsByParagraph(results2[personalStoryRatings.index(max(personalStoryRatings))]['url'])[1]
    print '\n\n'

    # WASHINGTON POST: score opinion pieces for solution language and print the best one
    waPoQuery = searchTerm + ' solution site:www.washingtonpost.com/opinions/'
    query = urllib.urlencode({'q': waPoQuery})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results4 = json1['responseData']['results']
    solutionRatings = []
    for x in range(0, len(results4)):
        url = results4[x]['url']
        bagOfWordsResults = bagOfWords(url, 0, searchTerm)
        solutionRatings.append(bagOfWordsResults[2])
        top2 = bagOfWordsResults[3]
        topRelatedWords.append(top2[0])
        topRelatedWords.append(top2[1])
    if solutionRatings:
        print bagOfWordsByParagraph(results4[solutionRatings.index(max(solutionRatings))]['url'])[2]
    print topRelatedWords
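# Illustrative sketch only, not in the original: the
# ratings.index(max(ratings)) pattern above selects the result with the
# highest score. The same selection reads more directly with max() over
# enumerate() and avoids a second pass over the list; the name bestUrl is
# hypothetical.
def bestUrl(results, ratings):
    """Return the url of the result whose rating is highest (assumes parallel lists)."""
    bestIndex, _ = max(enumerate(ratings), key=lambda pair: pair[1])
    return results[bestIndex]['url']

# e.g. bestUrl(results2, personalStoryRatings) in the REDDIT block above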
from data.getData import *
from bagOfWords import bagOfWords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Fit the count vectorizer and a tf (not tf-idf) weighting on the words returned by bagOfWords(200).
words = bagOfWords(200).getAllWords()
vec = CountVectorizer()
tvec = TfidfTransformer(use_idf=False)
x = vec.fit_transform(words)
tvec.fit(x)
max_feature_index = max(list(x.indices))  # renamed: the original assigned this to `max`, shadowing the builtin


def myBag(s):
    """Transform a whitespace-split string into a tf-weighted sparse matrix, one row per token."""
    t = vec.transform(s.split(" "))
    a = tvec.transform(t)
    return a
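# Illustrative usage only, not in the original file: myBag splits a string on
# spaces and returns a sparse term-frequency matrix with one row per token,
# using the vocabulary fit above. The sample string is made up.
if __name__ == "__main__":
    bag = myBag("this is a sample headline")
    print(bag.shape)      # (number of tokens, vocabulary size)
    print(bag.toarray())  # dense view; rows for out-of-vocabulary tokens are all zeros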
import json
import urllib

def googleSearch():
    # AUTHOR: Kristin Funch
    # bagOfWords, bagOfWordsByParagraph, bagOfWordsGivenParagraphs and findSentence
    # are defined elsewhere in this project.
    print '\n\n'
    searchTerm = raw_input('Query: ')
    print '\n\n'
    topRelatedWords = []

    # PROBLEM: paragraphs containing "when I first heard about ..."
    query = '"when I first heard about" ' + searchTerm
    query = urllib.urlencode({'q': query})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results4 = json1['responseData']['results']
    paragraphArr = []
    cleanParagraphArr = []
    for x in range(0, len(results4)):
        url = results4[x]['url']
        paragraphs = findSentence(url, '', 'when I first heard about', '')
        for i in range(0, len(paragraphs[0])):  # inner index renamed from x to avoid shadowing the outer loop variable
            paragraphArr.append(paragraphs[0][i])
            cleanParagraphArr.append(paragraphs[1][i])
    print paragraphArr[bagOfWordsGivenParagraphs(cleanParagraphArr)[1]]
    print '\n'

    # PROBLEM: paragraphs containing "the problem with <term> is"
    query = '"the problem with ' + searchTerm + ' is"'
    query = urllib.urlencode({'q': query})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results4 = json1['responseData']['results']
    paragraphArr = []
    cleanParagraphArr = []
    for x in range(0, len(results4)):
        url = results4[x]['url']
        paragraphs = findSentence(url, searchTerm, 'the problem with ', ' is')
        for i in range(0, len(paragraphs[0])):
            paragraphArr.append(paragraphs[0][i])
            cleanParagraphArr.append(paragraphs[1][i])
    print paragraphArr[bagOfWordsGivenParagraphs(cleanParagraphArr)[0]]
    print '\n'

    # WIKI: top English Wikipedia hit for the query
    query = searchTerm + ' site:en.wikipedia.org'
    query = urllib.urlencode({'q': query})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results = json1['responseData']['results']
    for x in range(0, 1):
        url = results[x]['url']
        top2 = bagOfWords(url, 0, searchTerm)[3]
        topRelatedWords.append(top2[0])
        topRelatedWords.append(top2[1])
        print bagOfWordsByParagraph(url)[0]
    print '\n'

    # SOLUTION: paragraphs containing "solution to <term>"
    query = '"solution to ' + searchTerm + '"'
    query = urllib.urlencode({'q': query})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results4 = json1['responseData']['results']
    paragraphArr = []
    cleanParagraphArr = []
    for x in range(0, len(results4)):
        url = results4[x]['url']
        paragraphs = findSentence(url, searchTerm, 'solution to ', '')
        for i in range(0, len(paragraphs[0])):
            paragraphArr.append(paragraphs[0][i])
            cleanParagraphArr.append(paragraphs[1][i])
    print paragraphArr[bagOfWordsGivenParagraphs(cleanParagraphArr)[2]]
    print '\n'
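# Illustrative sketch, not in the original: the paragraph-collection loop is
# repeated three times above, so it could be factored into one helper. The
# name collectParagraphs is hypothetical; findSentence is the author's
# existing function, assumed (from the call sites) to return a pair of
# parallel lists -- raw paragraphs for display and cleaned copies for scoring.
def collectParagraphs(results, searchTerm, prefix, suffix):
    paragraphArr = []
    cleanParagraphArr = []
    for result in results:
        raw, clean = findSentence(result['url'], searchTerm, prefix, suffix)
        paragraphArr.extend(raw)
        cleanParagraphArr.extend(clean)
    return paragraphArr, cleanParagraphArr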