Example #1
 def __search_results(self, page):
     # Fetch one page of search results and pass every unvisited
     # 'Talk' link on to __result.
     start = time.time()
     if page == 1:
         results = bs(urlopen(baseURL + queryString + self.searchTerm),
                      parseOnlyThese=ss('a', 'result_primary_link'))
     else:
         results = bs(urlopen(baseURL + queryString + self.searchTerm +
                              searchPageString + str(page)),
                      parseOnlyThese=ss('a', 'result_primary_link'))
     for link in results.contents:
         if (link['result-type'] == 'Talk'
                 and link['href'] not in self.listOfPosts):
             Investigator.__result(self, link['href'])
     print "__search_results Elapsed Time: %s" % (
         time.time() - start), self.searchTerm, ' page: ', page
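
All of the snippets on this page are methods lifted from a single scraper module (classes named Investigator and Condition) and omit its imports and module-level constants. A minimal sketch of the preamble they appear to assume; the code targets Python 2 and the BeautifulSoup 3 API (parseOnlyThese / SoupStrainer), and every constant value below is a hypothetical placeholder, not taken from the source:

import re
import time
from math import exp, log
from multiprocessing import Process, Queue, cpu_count
from urllib2 import urlopen, HTTPError           # Python 2 stdlib
from BeautifulSoup import BeautifulSoup as bs    # BeautifulSoup 3
from BeautifulSoup import SoupStrainer as ss

baseURL = 'http://forum.example.com'   # hypothetical site root
queryString = '/search?q='             # hypothetical search path
searchPageString = '&page='            # hypothetical search pagination suffix
postPageString = '?page='              # hypothetical post pagination suffix
postString = '/post/a'                 # hypothetical post URL prefix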
Example #2
 def __user(self, user):
     # Scrape a user's activity page, fan the unvisited post links out
     # to a pool of __posts workers, then collect the matching posts.
     try:
         start = time.time()
         inQueue = Queue()
         outQueue = Queue()
         processes = []
         links = bs(urlopen(baseURL + user + '/activity'),
                    parseOnlyThese=ss('a', href=re.compile('/post/a.')))
         for link in links.contents:
             if link['href'] not in self.visitedPosts:
                 inQueue.put(link['href'])
                 self.visitedPosts.append(link['href'])
         for i in range(cpu_count()):
             p = Process(target=Investigator.__posts,
                         args=(self, inQueue, outQueue))
             p.start()
             processes.append(p)
             inQueue.put('STOP')
         for p in processes:
             p.join()
         outQueue.put('STOP')
         for post in iter(outQueue.get, 'STOP'):
             self.listOfPosts.append(post)
         print "__user Elapsed Time: %s" % (time.time() - start), user
     except HTTPError:
         print 'HTTPError:', user
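
__user and the worker functions in Examples #3 and #5 share the classic STOP-sentinel pattern: the parent enqueues one 'STOP' per worker so each iter(inqueue.get, 'STOP') loop terminates, joins the workers, then puts a final 'STOP' on the output queue so it can drain the results the same way. A minimal, self-contained sketch of the pattern (Python 2, with a stand-in squaring workload):

from multiprocessing import Process, Queue, cpu_count

def worker(inqueue, outqueue):
    # iter(get, 'STOP') keeps calling get() until the sentinel arrives.
    for item in iter(inqueue.get, 'STOP'):
        outqueue.put(item * item)

if __name__ == '__main__':
    inQueue, outQueue = Queue(), Queue()
    for n in range(10):
        inQueue.put(n)
    processes = []
    for i in range(cpu_count()):
        p = Process(target=worker, args=(inQueue, outQueue))
        p.start()
        processes.append(p)
        inQueue.put('STOP')    # one sentinel per worker
    for p in processes:
        p.join()
    outQueue.put('STOP')       # sentinel for the parent's drain loop
    print sorted(iter(outQueue.get, 'STOP'))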
Example #3
def __frequent_words_worker(inqueue, outqueue):
    # Worker process: fetch each queued post and emit the text of its
    # first 'content' div, or -1 when the fetch or parse fails.
    for post in iter(inqueue.get, 'STOP'):
        try:
            texts = bs(urlopen(baseURL + post),
                       parseOnlyThese=ss('div', 'content'))
            outqueue.put(texts.contents[0].text)
            print 'frequent_words ', post
        except Exception:  # narrowed from a bare except so Ctrl-C still works
            print 'frequent_words Error:', post
            outqueue.put(-1)
Example #4
 def __result(self, post):
     # Walk the paginated user list of a post; a page with a single
     # nickname marks the end. Hand each unvisited user to __user.
     try:
         pageCount = 1
         while True:
             if pageCount == 1:
                 users = bs(urlopen(baseURL + post),
                            parseOnlyThese=ss('div', 'user_nickname'))
             else:
                 users = bs(urlopen(baseURL + post + postPageString +
                                    str(pageCount)),
                            parseOnlyThese=ss('div', 'user_nickname'))
             if len(users.contents) == 1:
                 break
             for user in users.contents:
                 if user.a['href'] not in self.visitedUsers:
                     self.visitedUsers.append(user.a['href'])
                     Investigator.__user(self, user.a['href'])
             pageCount += 1
     except HTTPError:
         print 'HTTPError:', post
Example #5
 def __posts(self, inqueue, outqueue):
     # Worker process: report posts where the search term appears in a
     # reply but not in the opening post.
     for post in iter(inqueue.get, 'STOP'):
         try:
             texts = bs(urlopen(baseURL + post),
                        parseOnlyThese=ss('div', 'post_content'))
             if len(texts.contents) > 1:
                 if not texts.contents[0].find(
                         text=re.compile(self.searchTerm)):
                     for content in texts.contents[1:]:
                         if content.find(text=re.compile(self.searchTerm)):
                             outqueue.put(post)
                             break
         except HTTPError:
             print 'HTTPError:', post
Example #6
 def find(self, lastvisited):
     # Read the newest post ID off the front page, then score every
     # post since lastvisited with __relevance_index, keeping the
     # nonzero scores.
     new = bs(urlopen(baseURL + '/?all_pf=all-newest#all-active-posts'),
              parseOnlyThese=ss('a', 'post_list_post_link_url'))
     newPosts = []
     for post in new.contents:
         newPosts.append(int(post['href'][7:15]))
     maxPost = max(newPosts)
     post = lastvisited
     while post < maxPost:
         relevance = Condition.__relevance_index(self,
                                                 postString + str(post))
         if relevance != 0:
             self.relevantPosts[postString + str(post)] = relevance
             print postString + str(post), relevance
         post += 1
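
find pulls the numeric post ID out of each href with the fixed slice post['href'][7:15], which assumes an eight-digit ID at a fixed offset. If the hrefs simply embed the ID as their only run of digits (which the slice suggests but the source does not show), a hypothetical, slightly more robust helper would be:

import re

def post_id(href):
    # Hypothetical helper: treat the first digit run in the href as the ID.
    match = re.search(r'\d+', href)
    return int(match.group()) if match else None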
Example #7
 def __relevance_index(self, post):
     # Score a post as the geometric mean of its known word weights,
     # computed in log space. Return 0 for posts of 100 words or fewer,
     # posts with no known words, and fetch errors.
     try:
         originalPost = bs(urlopen(baseURL + post),
                           parseOnlyThese=ss('div', 'content'))
         if len(originalPost.contents) != 0:
             words = re.findall(r'\w+', originalPost.contents[0].text)
             if len(words) > 100:
                 weightsList = []
                 for word in words:
                     if word in self.weightsDictionary:
                         weightsList.append(self.weightsDictionary[word])
                 if len(weightsList) != 0:
                     geometricMean = exp((1 / float(len(weightsList))) *
                                         sum([log(x) for x in weightsList]))
                     return geometricMean
                 else:
                     return 0
             else:
                 return 0
         else:
             return 0
     except HTTPError:
         return 0
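
__relevance_index takes the geometric mean in log space, which avoids the overflow and underflow that multiplying hundreds of weights directly could cause. A quick sanity check that exp(mean(log x)) matches the direct n-th-root-of-product definition:

from math import exp, log

weights = [0.5, 2.0, 8.0]
viaLogs = exp(sum(log(x) for x in weights) / float(len(weights)))
direct = (0.5 * 2.0 * 8.0) ** (1.0 / 3)
print viaLogs, direct    # both ~2.0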