Example no. 1
    def getLoginUrl(self, callbackToken):
        """

        """
        callback_url = utils.getDomain() + 'account/linked/facebook/login_callback.json'
        # Comma-separated Facebook permissions requested via the 'scope' parameter.
        permissions = 'user_about_me,user_location,email,publish_stream,publish_actions'
        path = "https://www.facebook.com/dialog/oauth?" \
               "client_id=%s" \
               "&redirect_uri=%s" \
               "&scope=%s" \
               "&state=%s" % \
               (APP_ID, callback_url, permissions, callbackToken)

        return path
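
A minimal usage sketch: the caller generates a state token, stores it for later verification, and redirects the user to the returned dialog URL. Only getLoginUrl comes from the snippet; the helper names here are hypothetical.

# Hypothetical caller, sketching how the dialog URL is typically used.
def handle_link_facebook(api, request):
    state = generate_callback_token()            # assumed CSRF-token helper
    url = api.getLoginUrl(state)
    save_state_for_user(request.userId, state)   # assumed persistence helper
    return redirect(url)                         # assumed HTTP redirect helper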
Example no. 2
    def getLoginUrl(self, callbackToken, netflixAddId=None):
        # Request a temporary OAuth request token and secret from Netflix.
        token_info = self.__get('oauth/request_token')
        token = oauth.OAuthToken(
            token_info['oauth_token'].encode('ascii'),
            token_info['oauth_token_secret'].encode('ascii'),
        )
        # The token secret is round-tripped through the callback URL so the
        # login_callback handler can finish the OAuth handshake.
        callback_url = utils.getDomain() + ('account/linked/netflix/login_callback.json?secret=%s&state=%s' %
                                            (token_info['oauth_token_secret'].encode('ascii'), callbackToken))
        if netflixAddId is not None:
            callback_url += "&netflix_add_id=%s" % netflixAddId
        token.set_callback(callback_url)

        oauthRequest = oauth.OAuthRequest.from_consumer_and_token(
            self.__consumer,
            http_url='https://api-user.netflix.com/oauth/login',
            parameters={'application_name': 'Stamped'},
            token=token,
            http_method='GET')

        url = oauthRequest.to_url()
        return (token_info['oauth_token_secret'], url)
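
Unlike the Facebook variant, this returns a (secret, url) pair, so the caller has to keep the request-token secret around until the callback fires. A sketch of the expected call site (helper names are assumptions):

# Hypothetical caller for the Netflix flow.
def handle_link_netflix(api, request):
    state = generate_callback_token()            # assumed CSRF-token helper
    secret, url = api.getLoginUrl(state)
    save_netflix_secret(request.userId, secret)  # assumed persistence helper
    return redirect(url)                         # assumed HTTP redirect helper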
Example no. 3
    def getUserAccessToken(self, code, client_id=APP_ID, client_secret=APP_SECRET):
        # Note: logging the client_secret is handy for debugging but unwise in production.
        logs.info('### getUserAccessToken called # client_id: %s, client_secret: %s, code: %s' % (client_id, client_secret, code))
        redirect_uri = utils.getDomain() + 'account/linked/facebook/login_callback.json'
        path = "oauth/access_token"
        result = self._get(
            None,
            path,
            parse_json      = False,
            client_id       = client_id,
            client_secret   = client_secret,
            code            = code,
            redirect_uri    = redirect_uri,
        )
        # The endpoint answers with a form-encoded body of the shape
        # "access_token=...&expires=...", so the fields are pulled out with regexes.
        r = re.search(r'access_token=([^&]*)', result)
        token = r.group(1)
        r = re.search(r'expires=([^&]*)', result)
        expires = None
        if r is not None:
            expires = r.group(1)
            #expires = datetime.fromtimestamp(time.time() + int(expires))
        return token, expires
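
The regex parsing above can also be done with the standard library. A minimal alternative sketch, assuming the same form-encoded "access_token=...&expires=..." response body (Python 2's urlparse module; parse_qs maps each key to a list of values):

import urlparse

def parse_access_token_response(result):
    fields = urlparse.parse_qs(result)
    token = fields['access_token'][0]
    # 'expires' may be absent, mirroring the regex version's None default.
    expires = fields.get('expires', [None])[0]
    return token, expires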
Example no. 4
    def crawl(self):
        self.harvestRatioData = []
        self.relevantPages = []
        webpages = []
        count = 0
        # All harvested page texts are appended to one dump file.
        ftext = open(self.pagesDir + "webpagesTxt.txt", "w")
        webpageLabel = 0  # 0 for Non-relevant and 1 for Relevant
        while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():

            work_url = self.priorityQueue.pop()
            url = work_url[1]
            # Skip URLs that were already crawled.
            if url in self.visited:
                continue
            self.visited[url] = 1
            page = Webpage(work_url, self.pagesCount)
            if page.text == '':
                continue

            page.estimatedScore = 0
            if self.combineScore:
                page_score = 0
                if len(page.text) > 0:
                    # Score the full page text; -1 means the scorer rejected it.
                    page_score = self.scorer.calculate_score(page, 'W')[1]
                    if page_score == -1:
                        continue
                else:
                    print 'page text is empty'
                    continue

                page.estimatedScore = page_score

                if self.restricted:
                    # Restricted mode: drop below-threshold pages outright.
                    if page_score < self.pageScoreThreshold:
                        continue

                pageDom = getDomain(url)
                # Per-domain tallies: [0] counts relevant pages, [1] non-relevant.
                if page_score >= self.pageScoreThreshold:
                    self.sourcesImp[pageDom][0] += 1
                    webpageLabel = 1
                else:
                    self.sourcesImp[pageDom][1] += 1
                    webpageLabel = 0
            if self.combineScore:
                print page.pageId, ": ", str(page_score), ",", -1 * work_url[0], ",", work_url[1]
            else:
                print -1 * work_url[0], ",", work_url[1]
            self.pagesCount += 1
            self.relevantPages.append(
                (page.pageId, (page.pageUrl[1], page.pageUrl[2]), page.estimatedScore))

            wbsStr = page.text.replace('\n', '. ').replace('\t', ' ')

            webpages.append(wbsStr)
            count += 1
            # Save page texts to disk in batches rather than keeping them all
            # in memory (note: this changes how evaluation reads the data).
            if count % self.bufferLen == 0:
                strToWrite = '\n'.join(webpages).encode("utf-8")
                # Append a newline so consecutive batches don't run together.
                ftext.write(strToWrite + '\n')
                webpages = []

            # page_score only exists when combineScore is on; guard the check
            # to avoid a NameError, and skip link expansion for weak pages.
            if self.combineScore and page_score < 0.1:
                continue
            page.getUrls()

            for link in page.outgoingUrls:
                url = link.address

                if url:
                    url = url.strip()
                    if url.find('report-a-typo') != -1:
                        continue
                    if url.find('m.tempo.co/') != -1:
                        continue
                    # Strip the query string unless it starts with an id=, v=,
                    # or tid= parameter that identifies the target page.
                    if url.find('?') != -1:
                        furl = url.split('?')[1]
                        if not furl.startswith(('id=', 'v=', 'tid=')):
                            url = url.split('?')[0]
                    if url.find('#') != -1:
                        url = url.split('#')[0]

                    if url.endswith('/'):
                        url = url[:-1]
                    #if url.endswith(("comment","comment/","feed","comments","feed/","comments/",".rss","video","video/","link","gif","jpeg","mp4","wav","jpg","mp3","png","share.php","sharer.php","login.php","print","print/","button/","share","email","submit","post",".pdf") ):
                    if url.endswith(
                        ("comment", "feed", "comments", ".rss", "video",
                         "link", "gif", "jpeg", "mp4", "wav", "jpg", "mp3",
                         "png", "share.php", "sharer.php", "login.php",
                         "print", "button", "share", "email", "submit", "post",
                         ".pdf")):
                        continue

                    if url in self.visited:
                        continue
                    if url.startswith('http'):
                        linkText = link.getAllText()
                        # Score the link (anchor text and context) for relevance.
                        url_score = self.scorer.calculate_score(link, 'U')
                        tot_score = url_score
                        if self.combineScore:
                            # Combine the page's relevance with the link's.
                            tot_score = page_score * url_score
                        if tot_score < self.urlScoreThreshold:
                            continue
                        urlDom = getDomain(url)

                        # Domain reputation: ratio of relevant to non-relevant
                        # pages seen so far (float() avoids integer division).
                        si_score = float(self.sourcesImp[urlDom][0]) / self.sourcesImp[urlDom][1]
                        if self.siScoreCombineMethod == 1:
                            if webpageLabel:
                                tot_score = tot_score * si_score
                        elif self.siScoreCombineMethod == 2:
                            tot_score = self.topicWeight * tot_score + self.siWeight * si_score
                        if self.restricted:
                            if tot_score < self.urlScoreThreshold:
                                continue
                        if tot_score >= self.urlScoreThreshold:
                            self.priorityQueue.push(
                                ((-1 * tot_score), url, page.pageId))

        print self.priorityQueue.isempty()

        # Flush any page texts still sitting in the buffer.
        if webpages:
            strToWrite = '\n'.join(webpages).encode("utf-8")
            ftext.write(strToWrite)
        ftext.close()

        return self.priorityQueue.queue
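
crawl() assumes a small priority-queue interface: push() of (-score, url, parent_id) tuples, pop() returning the smallest tuple first (so the score negation yields best-first order), isempty(), and a queue attribute. A minimal sketch of such a class over Python's heapq, offered as an assumption for illustration rather than the project's actual implementation:

import heapq

class PriorityQueue(object):
    """Best-first frontier: items are (-score, url, parent_id) tuples."""

    def __init__(self):
        self.queue = []

    def push(self, item):
        # heapq surfaces the smallest tuple first, so pushing negated
        # scores makes the highest-scoring URL come out of pop() first.
        heapq.heappush(self.queue, item)

    def pop(self):
        return heapq.heappop(self.queue)

    def isempty(self):
        return len(self.queue) == 0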