import json
import urllib

def googleSearch():
    # AUTHOR: Kristin Funch
    # bagOfWords, printDebate, printWikipedia and printReddit are defined elsewhere in this project.
    searchTerm = raw_input('Query: ')

    # DEBATE: top debate.org/opinions hit for the query
    debateQuery = searchTerm + ' site:www.debate.org/opinions'
    query = urllib.urlencode({'q': debateQuery})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results3 = json1['responseData']['results']
    for x in range(0, 1):
        url = results3[x]['url']
        top2 = bagOfWords(url, 0, searchTerm)[3]
        print top2
        printDebate(url)

    # WIKI: top English Wikipedia hit for the query
    query = searchTerm + ' site:en.wikipedia.org'
    query = urllib.urlencode({'q': query})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results = json1['responseData']['results']
    for x in range(0, 1):
        url = results[x]['url']
        top2 = bagOfWords(url, 0, searchTerm)[3]
        print top2
        printWikipedia(url)

    # REDDIT: score every r/self hit and print the most personal story
    redditQuery = searchTerm + ' site:www.reddit.com/r/self'
    query = urllib.urlencode({'q': redditQuery})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results2 = json1['responseData']['results']
    personalStoryRatings = []
    for x in range(0, len(results2)):
        url = results2[x]['url']
        bagOfWordsResults = bagOfWords(url, 1, searchTerm)
        personalStoryRatings.append(bagOfWordsResults[1])
        top2 = bagOfWordsResults[3]
        print top2
    print personalStoryRatings
    printReddit(results2[personalStoryRatings.index(max(personalStoryRatings))]['url'])
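# A minimal refactoring sketch, not part of the original script: the
# urlencode -> urlopen -> json.loads -> responseData/results sequence above is
# repeated for every source, so it could be pulled into one helper. The name
# googleResults is an assumption, not the author's code; the endpoint is the
# same legacy Google AJAX Search API URL used above.
def googleResults(q):
    """Return the result list for query q from the legacy AJAX Search API."""
    params = urllib.urlencode({'q': q})
    raw = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + params).read()
    return json.loads(raw)['responseData']['results']

# Example use, mirroring the DEBATE block above:
# results3 = googleResults(searchTerm + ' site:www.debate.org/opinions')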
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# countHeadlineLength, countRealFake, exportTestTrain and bagOfWords are project helpers defined elsewhere.
countHeadlineLength(X_train, y_train)
countRealFake(y_train, y_test)

# Leave-one-out splitting, currently disabled
# (note: the last assignment probably intends y_train[test_index] rather than y_test[test_index]):
# loo = LeaveOneOut()
# print(loo.get_n_splits(X_train))
# print(loo)
# for train_index, test_index in loo.split(X_train.to_frame(), y_train.to_frame()):
#     print("Train:", train_index, "Test:", test_index)
#     X_train, X_test = X_train[train_index], X_train[test_index]
#     y_train, y_test = y_train[train_index], y_test[test_index]
#     print(X_train, X_test, y_train, y_test)

exportTestTrain(X_train, X_test, y_train, y_test)

# Term counts for the training headlines, then tf and tf-idf weightings.
count_vect, X_train_counts = bagOfWords(X_train, False, True, True)
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print(X_train_tf.shape)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Counts -> tf-idf -> multinomial naive Bayes as a single pipeline.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
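# A minimal sketch of fitting and scoring the pipeline defined above; this is
# not from the original script. It assumes X_train/X_test are iterables of
# headline strings and y_train/y_test are the matching real/fake labels.
from sklearn import metrics

text_clf.fit(X_train, y_train)        # vectorize, tf-idf weight, then fit MultinomialNB
predicted = text_clf.predict(X_test)  # predict labels for the held-out headlines
print(metrics.accuracy_score(y_test, predicted))
print(metrics.classification_report(y_test, predicted))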
import json
import urllib

def googleSearch():
    # AUTHOR: Kristin Funch
    # bagOfWords and bagOfWordsByParagraph are defined elsewhere in this project.
    print '\n\n'
    searchTerm = raw_input('Query: ')
    print '\n\n'
    topRelatedWords = []

    # DEBATE: top debate.org/opinions hit for the query
    debateQuery = searchTerm + ' site:www.debate.org/opinions'
    query = urllib.urlencode({'q': debateQuery})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results3 = json1['responseData']['results']
    for x in range(0, 1):
        url = results3[x]['url']
        top2 = bagOfWords(url, 0, searchTerm)[3]
        topRelatedWords.append(top2[0])
        topRelatedWords.append(top2[1])
        print bagOfWordsByParagraph(url)[0]
    print '\n\n'

    # WIKI: top English Wikipedia hit for the query
    query = searchTerm + ' site:en.wikipedia.org'
    query = urllib.urlencode({'q': query})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results = json1['responseData']['results']
    for x in range(0, 1):
        url = results[x]['url']
        top2 = bagOfWords(url, 0, searchTerm)[3]
        topRelatedWords.append(top2[0])
        topRelatedWords.append(top2[1])
        print bagOfWordsByParagraph(url)[0]
    print '\n\n'

    # REDDIT: score every r/self hit and print the most personal story
    redditQuery = searchTerm + ' site:www.reddit.com/r/self'
    query = urllib.urlencode({'q': redditQuery})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results2 = json1['responseData']['results']
    personalStoryRatings = []
    for x in range(0, len(results2)):
        url = results2[x]['url']
        bagOfWordsResults = bagOfWords(url, 1, searchTerm)
        personalStoryRatings.append(bagOfWordsResults[1])
        top2 = bagOfWordsResults[3]
        topRelatedWords.append(top2[0])
        topRelatedWords.append(top2[1])
    # print personalStoryRatings
    print bagOfWordsByParagraph(results2[personalStoryRatings.index(max(personalStoryRatings))]['url'])[1]
    print '\n\n'

    # WASHINGTON POST: score opinion pieces for solution language and print the best one
    waPoQuery = searchTerm + ' solution site:www.washingtonpost.com/opinions/'
    query = urllib.urlencode({'q': waPoQuery})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results4 = json1['responseData']['results']
    solutionRatings = []
    for x in range(0, len(results4)):
        url = results4[x]['url']
        bagOfWordsResults = bagOfWords(url, 0, searchTerm)
        solutionRatings.append(bagOfWordsResults[2])
        top2 = bagOfWordsResults[3]
        topRelatedWords.append(top2[0])
        topRelatedWords.append(top2[1])
    if solutionRatings:
        print bagOfWordsByParagraph(results4[solutionRatings.index(max(solutionRatings))]['url'])[2]
    print topRelatedWords
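# Illustrative sketch only, not in the original: the
# ratings.index(max(ratings)) pattern above selects the result with the
# highest score. The same selection reads more directly with max() over
# enumerate() and avoids a second pass over the list; the name bestUrl is
# hypothetical.
def bestUrl(results, ratings):
    """Return the url of the result whose rating is highest (assumes parallel lists)."""
    bestIndex, _ = max(enumerate(ratings), key=lambda pair: pair[1])
    return results[bestIndex]['url']

# e.g. bestUrl(results2, personalStoryRatings) in the REDDIT block above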
from data.getData import *
from bagOfWords import bagOfWords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Fit the count vectorizer and a tf (not tf-idf) weighting on the words returned by bagOfWords(200).
words = bagOfWords(200).getAllWords()
vec = CountVectorizer()
tvec = TfidfTransformer(use_idf=False)
x = vec.fit_transform(words)
tvec.fit(x)
max_feature_index = max(list(x.indices))  # renamed: the original assigned this to `max`, shadowing the builtin


def myBag(s):
    """Transform a whitespace-split string into a tf-weighted sparse matrix, one row per token."""
    t = vec.transform(s.split(" "))
    a = tvec.transform(t)
    return a
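# Illustrative usage only, not in the original file: myBag splits a string on
# spaces and returns a sparse term-frequency matrix with one row per token,
# using the vocabulary fit above. The sample string is made up.
if __name__ == "__main__":
    bag = myBag("this is a sample headline")
    print(bag.shape)      # (number of tokens, vocabulary size)
    print(bag.toarray())  # dense view; rows for out-of-vocabulary tokens are all zeros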
import json
import urllib

def googleSearch():
    # AUTHOR: Kristin Funch
    # bagOfWords, bagOfWordsByParagraph, bagOfWordsGivenParagraphs and findSentence
    # are defined elsewhere in this project.
    print '\n\n'
    searchTerm = raw_input('Query: ')
    print '\n\n'
    topRelatedWords = []

    # PROBLEM: paragraphs containing "when I first heard about ..."
    query = '"when I first heard about" ' + searchTerm
    query = urllib.urlencode({'q': query})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results4 = json1['responseData']['results']
    paragraphArr = []
    cleanParagraphArr = []
    for x in range(0, len(results4)):
        url = results4[x]['url']
        paragraphs = findSentence(url, '', 'when I first heard about', '')
        for i in range(0, len(paragraphs[0])):  # inner index renamed from x to avoid shadowing the outer loop variable
            paragraphArr.append(paragraphs[0][i])
            cleanParagraphArr.append(paragraphs[1][i])
    print paragraphArr[bagOfWordsGivenParagraphs(cleanParagraphArr)[1]]
    print '\n'

    # PROBLEM: paragraphs containing "the problem with <term> is"
    query = '"the problem with ' + searchTerm + ' is"'
    query = urllib.urlencode({'q': query})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results4 = json1['responseData']['results']
    paragraphArr = []
    cleanParagraphArr = []
    for x in range(0, len(results4)):
        url = results4[x]['url']
        paragraphs = findSentence(url, searchTerm, 'the problem with ', ' is')
        for i in range(0, len(paragraphs[0])):
            paragraphArr.append(paragraphs[0][i])
            cleanParagraphArr.append(paragraphs[1][i])
    print paragraphArr[bagOfWordsGivenParagraphs(cleanParagraphArr)[0]]
    print '\n'

    # WIKI: top English Wikipedia hit for the query
    query = searchTerm + ' site:en.wikipedia.org'
    query = urllib.urlencode({'q': query})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results = json1['responseData']['results']
    for x in range(0, 1):
        url = results[x]['url']
        top2 = bagOfWords(url, 0, searchTerm)[3]
        topRelatedWords.append(top2[0])
        topRelatedWords.append(top2[1])
        print bagOfWordsByParagraph(url)[0]
    print '\n'

    # SOLUTION: paragraphs containing "solution to <term>"
    query = '"solution to ' + searchTerm + '"'
    query = urllib.urlencode({'q': query})
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json1 = json.loads(response)
    results4 = json1['responseData']['results']
    paragraphArr = []
    cleanParagraphArr = []
    for x in range(0, len(results4)):
        url = results4[x]['url']
        paragraphs = findSentence(url, searchTerm, 'solution to ', '')
        for i in range(0, len(paragraphs[0])):
            paragraphArr.append(paragraphs[0][i])
            cleanParagraphArr.append(paragraphs[1][i])
    print paragraphArr[bagOfWordsGivenParagraphs(cleanParagraphArr)[2]]
    print '\n'
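# Illustrative sketch, not in the original: the paragraph-collection loop is
# repeated three times above, so it could be factored into one helper. The
# name collectParagraphs is hypothetical; findSentence is the author's
# existing function, assumed (from the call sites) to return a pair of
# parallel lists -- raw paragraphs for display and cleaned copies for scoring.
def collectParagraphs(results, searchTerm, prefix, suffix):
    paragraphArr = []
    cleanParagraphArr = []
    for result in results:
        raw, clean = findSentence(result['url'], searchTerm, prefix, suffix)
        paragraphArr.extend(raw)
        cleanParagraphArr.extend(clean)
    return paragraphArr, cleanParagraphArr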