Esempio n. 1
0
def html2text(request):
    """Render ``html2text.html`` with the text of the POSTed ``url``.

    Reads the ``url`` field from ``request.POST``. When it is non-empty, a
    ``Document`` is built from it and the result of its ``text()`` call is
    placed in the template context under ``text``; otherwise ``text`` is the
    empty string. Returns an ``HttpResponse`` wrapping the rendered template.
    """
    url = request.POST.get('url', '')
    # Only build a Document when a URL was actually submitted.
    extracted = Document(url).text() if url else ''
    page_context = Context({'text': extracted})
    rendered = get_template('html2text.html').render(page_context)
    return HttpResponse(rendered)
Esempio n. 2
0
 def doc(self):
     """Return the ``Document`` for ``self.url``, consulting ``cache`` first.

     The cache key is ``"url_"`` followed by the URL. A truthy cached value
     is returned directly after printing a hit message. Otherwise a new
     ``Document`` is built, stored via ``cache.set`` with a third argument
     of 60 (presumably a timeout in seconds -- confirm against the cache
     backend), and returned.
     """
     key = "url_" + self.url
     cached = cache.get(key)
     if not cached:
         # Miss (or falsy entry): build the document and remember it.
         fresh = Document(self.url)
         cache.set(key, fresh, 60)
         return fresh
     print("Cache hit: URL: " + self.url)
     return cached
Esempio n. 3
0
    def data(self, all_fields=True):
        """
            Calculate context of quotation using QuoteContext class
            Optionally return a smaller subset of fields to upload to cloud

            The result holds the quote hash, both URLs, the ``doc_type``
            and ``text`` of the citing and cited documents, and every
            QuoteContext field, each prefixed with ``citing_`` or
            ``cited_``. Raw documents are included only when
            ``self.raw_output`` is truthy.

            :param all_fields: when false, the raw, text, position and
                timing fields are removed before returning.
            :return: dict of quotation data keyed by field name.
        """

        data_dict = {
            'sha1': self.hash(),
            'citing_url': self.citing_url,
            'cited_url': self.cited_url,
        }

        # Get text version of document
        citing_doc = Document(self.citing_url)
        cited_doc = Document(self.cited_url)

        # Populate context fields with Document methods
        document_fields = ['doc_type', 'text']
        quote_context_fields = [
            'context_before', 'context_after',
            'quote_length',
            'quote',
            'quote_start_position', 'quote_end_position',
            'context_start_position', 'context_end_position',
        ]

        if self.raw_output:
            data_dict['citing_raw'] = citing_doc.raw()
            data_dict['cited_raw'] = cited_doc.raw()

        # Ask each document for its data once instead of once per field.
        citing_doc_data = citing_doc.data()
        cited_doc_data = cited_doc.data()
        for doc_field in document_fields:
            data_dict['citing_' + doc_field] = citing_doc_data[doc_field]
            data_dict['cited_' + doc_field] = cited_doc_data[doc_field]

        # Find context of quote from within text
        citing_context = QuoteContext(self.citing_quote, citing_doc.text())
        cited_context = QuoteContext(self.citing_quote, cited_doc.text())

        # Likewise, evaluate each context once and reuse the result.
        citing_context_data = citing_context.data()
        cited_context_data = cited_context.data()
        for field in quote_context_fields:
            data_dict['citing_' + field] = citing_context_data[field]
            data_dict['cited_' + field] = cited_context_data[field]

        # Stop Elapsed Timer
        elapsed_time = time.time() - self.start_time
        data_dict['create_elapsed_time'] = format(elapsed_time, '.5f')

        if not all_fields:
            excluded_fields = [
                'cited_raw', 'citing_raw',
                'citing_text', 'cited_text',
                'citing_quote_length',
                'cited_quote_start_position', 'citing_quote_start_position',
                'cited_quote_end_position', 'citing_quote_end_position',
                'cited_context_start_position',
                'citing_context_start_position',
                'cited_context_end_position', 'citing_context_end_position',
                'create_elapsed_time',
            ]  # 'cited_cache_url', 'cited_archive_url',

            for excluded_field in excluded_fields:
                # The raw fields exist only when raw_output is set, so a
                # missing key must not raise.
                data_dict.pop(excluded_field, None)

        return data_dict
Esempio n. 4
0
    def data(self, all_fields=True):
        """
            Calculate context of quotation using QuoteContext class
            Optionally return a smaller subset of fields to upload to cloud

            The complete result is stored in ``cache`` under
            ``self.cache_key()``; the ``all_fields`` filter is applied to a
            copy on the way out. A caller asking for every field is
            therefore never handed a reduced dict cached by an earlier
            call, and a cache hit still honours ``all_fields=False``.

            :param all_fields: when false, the text, position and timing
                fields are left out of the returned dict.
            :return: dict of quotation data keyed by field name.
        """
        data_dict = cache.get(self.cache_key())
        if data_dict:
            print("Returning cached Quote Data: " + self.cited_url)
        else:
            print("Looking up data() for " + self.cited_url)
            data_dict = {
                'sha1': self.hash(),
                'citing_url': self.citing_url,
                'cited_url': self.cited_url,
            }

            # NOTE(review): an earlier branch here re-assigned
            # self.citing_text / self.citing_raw to themselves when either
            # was empty, which had no effect. Presumably it was meant to
            # fetch the citing document when no text was passed in --
            # confirm the intended fallback.
            citing_text = self.citing_text
            print("Populating Document: Doc, Text, Raw")
            cited_doc = Document(self.cited_url)
            print("3)----data()['text']-----")
            cited_text = cited_doc.text()
            print("End Populating")

            data_dict['citing_text'] = citing_text
            data_dict['cited_text'] = cited_text

            # Find context of quote from within text
            citing_context_data = QuoteContext(self.citing_quote,
                                               citing_text).data()
            cited_context_data = QuoteContext(self.citing_quote,
                                              cited_text).data()

            # Populate context fields with Document methods
            quote_context_fields = [
                'context_before',
                'context_after',
                'quote_length',
                'quote',
                'quote_start_position',
                'quote_end_position',
                'context_start_position',
                'context_end_position',
            ]
            for field in quote_context_fields:
                data_dict['citing_' + field] = citing_context_data[field]
                data_dict['cited_' + field] = cited_context_data[field]

            # Stop Elapsed Timer
            elapsed_time = time.time() - self.start_time
            data_dict['create_elapsed_time'] = format(elapsed_time, '.5f')

            # Cache the full dict so later calls can serve either shape.
            cache.set(self.cache_key(), data_dict, 60)

        if all_fields:
            return data_dict

        excluded_fields = {
            'cited_raw',
            'citing_raw',
            'citing_text',
            'cited_text',
            'citing_quote_length',
            'cited_quote_start_position',
            'citing_quote_start_position',
            'cited_quote_end_position',
            'citing_quote_end_position',
            'cited_context_start_position',
            'citing_context_start_position',
            'cited_context_end_position',
            'citing_context_end_position',
            'create_elapsed_time',
        }  # 'cited_cache_url', 'cited_archive_url',

        # Filter into a new dict: excluded keys that were never set (the raw
        # fields) are simply skipped, and the cached object is not mutated.
        return {
            field: value
            for field, value in data_dict.items()
            if field not in excluded_fields
        }
Esempio n. 5
0
 def raw(self):
     """Return the result of ``raw()`` on a ``Document`` built from ``self.url``.

     A new ``Document`` is constructed on every call; nothing is stored on
     the instance.
     """
     return Document(self.url).raw()