def getThread(thread, board=None):
    url = ""
    if thread is None:
        raise Exception("getThread must be passed a threadId or URL.")
    # Check if the argument is already a URL
    result = re.search(r"boards\.4chan\.org/(.+)/res/(.+)", thread)
    if result is not None:
        url = thread
    # If not, generate the URL
    else:
        if board is not None:
            # Remove surrounding slashes, if any
            if board[0] == "/":
                board = board[1:]
            if board[-1] == "/":
                board = board[:-1]
            url = "http://boards.4chan.org/%s/res/%s" % (board, thread)
        else:
            raise Exception("If getThread is given a threadId, "
                            "then a board name must also be given")
    # Now we have a URL to download the thread
    pageData = downloader.downloadUrl(url)
    # Make some soup (an explicit parser avoids bs4's "no parser" warning)
    soup = BeautifulSoup(pageData, "html.parser")
    posts = soup.find_all("div", "post")
    posts = [Post(p) for p in posts]  # Turn them into Post objects
    return Thread(posts)
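# Usage sketch for the HTML-scraping variant above. Both call forms resolve to
# the same thread page; the thread id and board are hypothetical examples, and
# downloader, Post, and Thread are assumed to be defined elsewhere in the
# project as the function implies.
if __name__ == "__main__":
    t1 = getThread("http://boards.4chan.org/g/res/12345678")  # full-URL form
    t2 = getThread("12345678", board="/g/")  # id form; slashes are stripped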
def __process_url(self):
    while True:
        url = ""
        with self.__urlList_lock:
            if len(self.__urlList) > 0:
                # Pop the next URL off the shared work list
                url = self.__urlList[0]
                self.__urlList = self.__urlList[1:]
            else:
                if self.__linksDone:
                    break
        if url == "":
            # Nothing queued yet; wait for the link producer
            time.sleep(0.1)
            continue
        if self.__trace:
            with self.__general_lock:
                print("Parsing", url)
        content_type, data = downloadUrl(url)
        if self.__trace:
            with self.__general_lock:
                print("Content-Type = \"%s\", len = %d" % (content_type, len(data)))
        if content_type == "null":
            continue
        if content_type in FileDownloader.APP_EXT:
            # The URL itself points at a downloadable file; record it under
            # the same lock used for the other appends below
            with self.__urls_lock:
                self.__urls.append(url)
            continue
        if "text" in content_type:
            try:
                data = data.decode('utf-8')
            except UnicodeDecodeError:
                continue
        else:
            # Not a known file type and not a page we can scan for links
            continue
        url_regexp = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        matches = re.findall(url_regexp, data)
        for link in matches:
            for ext in FileDownloader.DATA_EXT:
                if link.endswith(ext):
                    if self.__trace:
                        with self.__general_lock:
                            print("Found \"%s\" file by extension \"%s\"" % (link, ext))
                    with self.__urls_lock:
                        self.__urls.append(link)
                    break
        if self.__trace:
            with self.__general_lock:
                print("finished ", url)
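# __process_url above (and the snippets below) call a downloadUrl(url) helper
# that returns a (content_type, data) tuple. The real implementation is not
# shown here; a minimal sketch using only the standard library might look like
# this ("null" mirrors the sentinel the worker checks for):
import urllib.request

def downloadUrl(url):
    with urllib.request.urlopen(url) as response:
        content_type = response.headers.get("Content-Type", "null")
        data = response.read()
    return content_type, data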
def getThread(thread, board=None):
    url = ""
    if thread is None:
        raise Exception("getThread must be passed a threadId or URL.")
    # Check if the argument is already a URL
    result = re.search(r"boards\.4chan\.org/(.+)/res/(.+)", thread)
    if result is not None:
        # The JSON API lives at the thread URL plus a .json suffix
        thread += ".json"
        url = thread
    # If not, generate the URL
    else:
        if board is not None:
            # Strip surrounding slashes so "/g/" and "g" both work,
            # as the HTML-scraping variant does
            board = board.strip("/")
            url = "https://boards.4chan.org/%s/res/%s.json" % (board, thread)
        else:
            raise Exception("If getThread is given a threadId, "
                            "then a board name must also be given")
    # Now we have a URL to download the thread
    pageData = downloader.downloadUrl(url)
    return json.loads(pageData)
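# Example of consuming the parsed thread. 4chan's read-only JSON API returns a
# dict with a "posts" list whose entries carry fields such as "no" (post
# number) and "com" (comment HTML); the thread id below is a placeholder.
if __name__ == "__main__":
    thread = getThread("12345678", board="g")
    for post in thread["posts"]:
        print(post["no"], post.get("com", ""))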
def __fetch_page(self, page):
    # The first page and subsequent pages use different URL templates
    searchPattern = GoogleSearch.SEARCH_URL
    if page > 0:
        searchPattern = GoogleSearch.NEXT_PAGE
    searchUrl = searchPattern % {
        'query': urllib.parse.quote_plus(self.query),
        'start': page * self.num,
        'tld': self.tld,
        'lang': self.lang,
    }
    content_type, data = downloadUrl(searchUrl)
    if "text" in content_type:
        data = data.decode('utf-8')
    return data
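# __fetch_page fills %-style templates with named placeholders, so the class
# constants must use the %(name)s form. The real SEARCH_URL/NEXT_PAGE values
# are defined on GoogleSearch and may differ; this is only a plausible sketch:
SEARCH_URL = ("http://www.google.%(tld)s/search?"
              "hl=%(lang)s&q=%(query)s&btnG=Google+Search")
NEXT_PAGE = ("http://www.google.%(tld)s/search?"
             "hl=%(lang)s&q=%(query)s&start=%(start)d")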
def readRemoteFile(url):
    content_type, remoteFile = downloadUrl(url)
    res = None
    # Dispatch on the Content-Type reported by the server
    if 'text' in content_type:
        res = readTxtFile(io.StringIO(remoteFile.decode('utf-8')))
    elif 'application/pdf' in content_type:
        res = readPdfFile(io.BytesIO(remoteFile))
    if res is None:
        return {'status': 'error'}
    text = res['data']
    keywords = extractPlainKeywords(text)
    if keywords is not None:
        res['keywords'] = keywords
    keyword_expr = extractPlainKeywordsExpressions(text)
    if keyword_expr is not None:
        res['keywords_expressions'] = keyword_expr
    emails = extractEmail(text)
    if emails is not None:
        res['emails'] = emails
    return res
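# Usage sketch: the URL is a placeholder. On failure the function returns
# {'status': 'error'}; on success it returns whatever readTxtFile/readPdfFile
# produced, which (as the code above implies) carries the extracted text under
# 'data' plus any keyword/e-mail fields that were found.
if __name__ == "__main__":
    info = readRemoteFile("http://example.com/paper.pdf")
    if info.get('status') == 'error':
        print("Could not read remote file")
    else:
        print(info['data'][:200])
        print(info.get('emails', []))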