Exemples Python de loadFromUrl (src.search.SearchResultParsing.loadFromUrl)

Exemple #1

0

Afficher le fichier

Fichier : YahooKeywordQueryBuilder.py Projet : jtedesco/EntityQuerier

    def extractKeywords(self, entityDescription):
        """
          Retrieve the set of keywords for the entity description.

            The description replaces the '####' placeholder of 'self.apiUrl'. The JSON
            response is read from 'self.cache' when present; otherwise it is fetched with
            'loadFromUrl' and written to the cache before being parsed.

            @param  entityDescription   The text substituted into the API URL template.
            @return A list of the distinct keywords found in the response (in no particular
                    order), or an empty list if the response holds no usable results.
        """

        # Fill the URL template
        url = self.apiUrl.replace("####", entityDescription)
        url = url.replace(" ", "%20")
        url = url.replace("AT&T", "")  # Blows up Yahoo API for some reason...

        # Get the keyword data, from the cache if possible
        keywordJson = self.cache.read(url)
        if keywordJson is None:
            keywordJson = loadFromUrl(url)
            self.cache.write(url, keywordJson)
        keywordData = loads(keywordJson)

        try:
            # list(...) keeps the indexing valid where 'values()' returns a view, not a list
            firstResult = list(keywordData["query"]["results"].values())[0]
        except (AttributeError, KeyError, IndexError, TypeError):
            # No usable results: 'results' is null or empty, or the expected keys are missing
            return []

        keywords = set()
        if isinstance(firstResult, (str, type(u""))):
            # A string value is one keyword; set(<string>) would split it into characters
            keywords.add(firstResult)
        else:
            keywords.update(firstResult)

        return list(keywords)

Exemple #2

0

Afficher le fichier

Fichier : YQLKeywordExtension.py Projet : jtedesco/EntityQuerier

    def getKeywordsFromContent(self, content):
        """
          Retrieve the Yahoo keywords from some content, by splitting the content and joining
            the results if the content is too long to be sent via a URL.

            Each chunk replaces the '####' placeholder of 'self.apiUrl'; its JSON response is
            read from 'self.cache' when present, otherwise fetched with 'loadFromUrl' and
            cached. A chunk whose response holds no usable results is skipped, so it does
            not discard the keywords gathered from the other chunks.

            @param  content The content from which to retrieve the keyword information.
            @return A list of the distinct keywords collected over all chunks (in no
                    particular order).
        """

        # Split the content
        content = self.__cleanResults(content)
        contentChunks = self.__group(content, self.contentSize)

        # Get the keywords for each chunk
        keywords = set()
        for chunk in contentChunks:

            # Fill the URL template
            url = self.apiUrl.replace('####', chunk)
            url = url.replace(' ', '%20')

            # Get the keyword data, from the cache if possible
            keywordJson = self.cache.read(url)
            if keywordJson is None:
                keywordJson = loadFromUrl(url)
                self.cache.write(url, keywordJson)
            keywordData = loads(keywordJson)

            try:
                # list(...) keeps the indexing valid where 'values()' returns a view, not a list
                firstResult = list(keywordData['query']['results'].values())[0]
            except (AttributeError, KeyError, IndexError, TypeError):
                # No usable results for this chunk ('results' null/empty, or keys missing)
                continue

            if isinstance(firstResult, (str, type(u''))):
                # A string value is one keyword; set(<string>) would split it into characters
                keywords.add(firstResult)
            else:
                keywords.update(firstResult)

        return list(keywords)

Exemple #3

0

Afficher le fichier

Fichier : DMOZCrawlerThread.py Projet : jtedesco/EntityQuerier

    def run(self):
        """
          Parse the content of this page, and update the given dictionary for this thread.

            Nothing is fetched when the cache file for 'self.url' already exists. Otherwise
            the page is loaded with 'loadFromUrl' and, if it passes 'isHTML', stored under
            'content' in 'self.resultDictionary'. When 'self.saveData' is set, the title,
            keywords, description, headers, page rank and YQL keywords are added to the
            dictionary as well, and the dictionary is dumped to the cache file.

            A 'URLError' raised along the way is reported on standard output, not propagated.
        """

        try:

            # Get the content from this page
            print("Getting page content for '%s'" % self.url.strip())

            filename = self.__encodeCacheFilename(self.url)

            # Skip the fetch entirely when the cache file is already on disk
            if os.path.exists(filename):
                return

            try:
                content = loadFromUrl(self.url)
            except ValueError:
                content = None
                print("Error with URL: " + self.url)

            # Extract the content from this page (only for pages accepted by isHTML)
            if content is None or not isHTML(content):
                return

            # The dictionary keeps the original content; the parsing below uses a lowercased copy
            self.resultDictionary['content'] = content
            content = content.lower()
            if not self.saveData:
                return

            # Get the information about this url
            try:
                title, keywords, description = parseMetaDataFromContent(content)
                pageRank = self.prCache.getPageRank(self.url)
                headers = parseHeaderInformationFromContent(content)

                # Get the YQL keywords for this DMOZ document (best effort: any failure
                # leaves the keyword list empty)
                try:
                    yqlKeywordsExtension = YQLKeywordExtension()
                    yqlKeywords = yqlKeywordsExtension.getKeywordsFromContent(content)
                except Exception:
                    yqlKeywords = []

                # Store the extra data
                self.resultDictionary['keywords'] = keywords
                self.resultDictionary['headers'] = headers
                self.resultDictionary['description'] = description
                self.resultDictionary['yqlKeywords'] = yqlKeywords
                self.resultDictionary['pageRank'] = pageRank
                self.resultDictionary['title'] = title

                # Save the result file; the 'with' block closes the handle even if dump fails
                with open(filename, 'w') as resultFile:
                    dump(self.resultDictionary, resultFile)
            except UnicodeDecodeError:
                print("Failed to save DMOZ document: " + self.url)

        except URLError:
            print("Error accessing '%s', %s" % (self.url.strip(), str(sys.exc_info()[1]).strip()))