Example #1
import re

from bs4 import BeautifulSoup

# downloader, Post and Thread are project-specific helpers assumed by this snippet.
def getThread(thread, board=None):
  url = ""
  if thread is None:
    raise Exception("getThread must be passed a threadId or URL.")
  # Check if the argument is already a URL
  result = re.search(r"boards\.4chan\.org/(.+)/res/(.+)", thread)
  if result is not None:
    url = thread
  # If not, generate the URL
  else:
    if board is not None:
      # Remove surrounding slashes, if any
      board = board.strip("/")
      url = "http://boards.4chan.org/%s/res/%s" % (board, thread)
    else:
      raise Exception("If getThread is given a threadId, "
                      "then a board name must also be given")
  # Now we have a URL to download the thread
  pageData = downloader.downloadUrl(url)
  # Make some soup
  soup = BeautifulSoup(pageData, "html.parser")
  posts = soup.find_all("div", "post")
  posts = list(map(Post, posts))  # Turn them into Post objects
  return Thread(posts)
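A minimal usage sketch, assuming the getThread above and its project-specific Thread class; the board name and thread id are illustrative, not real data:

# Both call forms accepted by getThread; values are illustrative only.
t1 = getThread("12345678", board="/b/")
t2 = getThread("http://boards.4chan.org/b/res/12345678")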
Example #2
    # Worker-loop method of a FileDownloader-like class; assumes re and time
    # are imported and a module-level downloadUrl helper is available.
    def __process_url(self):
        while True:
            url = ""

            with self.__urlList_lock:
                if len(self.__urlList) > 0:
                    url = self.__urlList[0]
                    self.__urlList = self.__urlList[1:]
                else:
                    if self.__linksDone:
                        break

            if url == "":
                time.sleep(0.1)
                continue

            if self.__trace:
                with self.__general_lock:
                    print("Parsing", url)

            content_type, data = downloadUrl(url)

            if self.__trace:
                with self.__general_lock:
                    print("Content-Type = \"%s\", len = %d" % (content_type, len(data)))
          
            if content_type == "null":
                continue

            if content_type in FileDownloader.APP_EXT:
                # Guard the shared list, as elsewhere in this method
                with self.__urls_lock:
                    self.__urls.append(url)
                continue

            if "text" in content_type:
                try:
                    data = data.decode('utf-8')
                except:
                    pass
                    continue
                
                url_regexp = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
                matches = re.findall(url_regexp, data)
            
                for link in matches:
                    for ext in FileDownloader.DATA_EXT:
                        if link.endswith(ext):
                            if self.__trace:
                                with self.__general_lock:
                                    print("Found \"%s\" file by extension \"%s\"" % (link, ext))
                            # Record the link and stop checking further
                            # extensions, whether or not tracing is on
                            with self.__urls_lock:
                                self.__urls.append(link)
                            break

                if self.__trace:
                    with self.__general_lock:
                        print("finished ", url)
Example #3
import json
import re

# downloader is a project-specific helper assumed by this snippet.
def getThread(thread, board=None):
  url = ""
  if thread is None:
    raise Exception("getThread must be passed a threadId or URL.")
  # Check if the argument is already a URL
  result = re.search(r"boards\.4chan\.org/(.+)/res/(.+)", thread)
  if result is not None:
    url = thread + ".json"
  # If not, generate the URL
  else:
    if board is not None:
      url = "https://boards.4chan.org/%s/res/%s.json" % (board, thread)
    else:
      raise Exception("If getThread is given a threadId, "
                      "then a board name must also be given")
  # Now we have a URL to download the thread
  pageData = downloader.downloadUrl(url)
  return json.loads(pageData)
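A brief usage sketch, assuming the JSON layout of 4chan's read-only API, where a thread is an object with a "posts" array; the thread id is illustrative:

thread = getThread("12345678", board="b")   # illustrative id
for post in thread.get("posts", []):
    print(post.get("no"), post.get("com", ""))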
Example #4
    # Method of a GoogleSearch-like class; assumes urllib.parse is imported
    # and SEARCH_URL / NEXT_PAGE are %(name)s-style URL templates.
    def __fetch_page(self, page):
        searchPattern = GoogleSearch.SEARCH_URL
        if page > 0:
            searchPattern = GoogleSearch.NEXT_PAGE

        searchUrl = searchPattern % {
            'query': urllib.parse.quote_plus(self.query),
            'start': page * self.num,
            'tld': self.tld,
            'lang': self.lang,
        }

        content_type, data = downloadUrl(searchUrl)
        if "text" in content_type:
            data = data.decode('utf-8')
        return data
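The SEARCH_URL and NEXT_PAGE templates themselves are not shown; a plausible, purely hypothetical reconstruction based only on the placeholder names the method fills in:

class GoogleSearch:
    # Hypothetical values; only the placeholder names are known from the snippet.
    SEARCH_URL = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s"
    NEXT_PAGE = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&start=%(start)d"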
Example #5
import io

# downloadUrl, readTxtFile, readPdfFile and the extract* helpers are
# project-specific functions assumed by this snippet.
def readRemoteFile(url):
    content_type, remoteFile = downloadUrl(url)

    res = None

    if 'text' in content_type:
        res = readTxtFile(io.StringIO(remoteFile.decode('utf-8')))
    elif 'application/pdf' in content_type:
        res = readPdfFile(io.BytesIO(remoteFile))
    if res is None:
        return {'status': 'error'}

    text = res['data']
    keywords = extractPlainKeywords(text)
    if keywords is not None:
        res['keywords'] = keywords
    keyword_expr = extractPlainKeywordsExpressions(text)
    if keyword_expr is not None:
        res['keywords_expressions'] = keyword_expr
    emails = extractEmail(text)
    if emails is not None:
        res['emails'] = emails
    return res
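A short usage sketch, assuming the helpers above; the URL is illustrative, and the optional keys appear only when the corresponding extractor matched:

info = readRemoteFile("http://example.com/paper.pdf")   # illustrative URL
if info.get('status') != 'error':
    print(info.get('keywords'), info.get('emails'))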