def html2text(request):
    """
        Render the 'html2text.html' template for a posted URL

        Reads the 'url' field from request.POST.  When it is non-empty,
        the template's 'text' variable is the result of
        Document(url).text(); otherwise it is an empty string.
        Returns the rendered template wrapped in an HttpResponse.
    """
    submitted_url = request.POST.get('url', '')

    # Only build a Document when a URL was actually submitted
    extracted_text = Document(submitted_url).text() if submitted_url else ''

    template_context = Context({'text': extracted_text})
    rendered_page = get_template('html2text.html').render(template_context)
    return HttpResponse(rendered_page)
def doc(self):
    """
        Return the Document for self.url, preferring a cached copy

        The cache is consulted under the key "url_" + self.url.  On a
        miss a new Document is built from self.url and stored under
        that key before being returned.
    """
    key = "url_" + self.url

    cached_document = cache.get(key)
    if cached_document:
        print("Cache hit: URL: " + self.url)
        return cached_document

    # Cache miss: build the document and remember it.
    # NOTE(review): 60 is presumably a timeout in seconds -- confirm
    # against the cache backend in use.
    fresh_document = Document(self.url)
    cache.set(key, fresh_document, 60)
    return fresh_document
def data(self, all_fields=True):
    """
        Calculate context of quotation using QuoteContext class
        Optionally return a smaller subset of fields to upload to cloud

        Builds a dict containing:
          * 'sha1', 'citing_url', 'cited_url'
          * 'citing_raw' / 'cited_raw' -- only when self.raw_output is set
          * 'citing_<f>' / 'cited_<f>' for each Document field
            ('doc_type', 'text')
          * 'citing_<f>' / 'cited_<f>' for each QuoteContext field
          * 'create_elapsed_time' -- seconds since self.start_time,
            formatted with five decimals

        Both contexts are computed by searching for self.citing_quote,
        once in the citing document's text and once in the cited
        document's text.

        all_fields: when False, the bulky text/raw/position fields
            listed in excluded_fields are removed before returning.
    """
    data_dict = {
        'sha1': self.hash(),
        'citing_url': self.citing_url,
        'cited_url': self.cited_url,
    }

    # Get text version of document
    citing_doc = Document(self.citing_url)
    cited_doc = Document(self.cited_url)

    # Populate context fields with Document methods
    document_fields = ['doc_type', 'text']
    quote_context_fields = [
        'context_before',
        'context_after',
        'quote_length',
        'quote',
        'quote_start_position',
        'quote_end_position',
        'context_start_position',
        'context_end_position',
    ]

    if self.raw_output:
        data_dict['citing_raw'] = citing_doc.raw()
        data_dict['cited_raw'] = cited_doc.raw()

    # Call data() once per document instead of once per field
    citing_doc_data = citing_doc.data()
    cited_doc_data = cited_doc.data()
    for doc_field in document_fields:
        data_dict['citing_' + doc_field] = citing_doc_data[doc_field]
        data_dict['cited_' + doc_field] = cited_doc_data[doc_field]

    # Find context of quote from within text (computed once per document)
    citing_context_data = QuoteContext(
        self.citing_quote, citing_doc.text()
    ).data()
    cited_context_data = QuoteContext(
        self.citing_quote, cited_doc.text()
    ).data()
    for field in quote_context_fields:
        data_dict['citing_' + field] = citing_context_data[field]
        data_dict['cited_' + field] = cited_context_data[field]

    # Stop Elapsed Timer
    elapsed_time = time.time() - self.start_time
    data_dict['create_elapsed_time'] = format(elapsed_time, '.5f')

    if not all_fields:
        excluded_fields = [
            'cited_raw',
            'citing_raw',
            'citing_text',
            'cited_text',
            'citing_quote_length',
            'cited_quote_start_position',
            'citing_quote_start_position',
            'cited_quote_end_position',
            'citing_quote_end_position',
            'cited_context_start_position',
            'citing_context_start_position',
            'cited_context_end_position',
            'citing_context_end_position',
            'create_elapsed_time',
        ]
        for excluded_field in excluded_fields:
            # The raw fields exist only when self.raw_output is set, so
            # a missing key must be tolerated rather than raise KeyError.
            data_dict.pop(excluded_field, None)

    return data_dict
def data(self, all_fields=True):
    """
        Calculate context of quotation using QuoteContext class
        Optionally return a smaller subset of fields to upload to cloud

        The complete result is cached under self.cache_key().  The
        all_fields filter is applied after the cache lookup, so a cached
        entry serves callers of either flavour correctly.

        Returned dict contains 'sha1', 'citing_url', 'cited_url',
        'citing_text', 'cited_text', a 'citing_<f>' / 'cited_<f>' pair
        for each QuoteContext field, and 'create_elapsed_time' (seconds
        since self.start_time, five decimals).

        Both contexts are computed by searching for self.citing_quote,
        once in the citing text and once in the cited document's text.

        all_fields: when False, a copy without the bulky text/position
            fields listed in excluded_fields is returned.
    """
    # Fields left out of the reduced (all_fields=False) result.  The raw
    # fields are never stored by this method; they are listed so the
    # reduced result stays free of them if they ever are.
    excluded_fields = {
        'cited_raw',
        'citing_raw',
        'citing_text',
        'cited_text',
        'citing_quote_length',
        'cited_quote_start_position',
        'citing_quote_start_position',
        'cited_quote_end_position',
        'citing_quote_end_position',
        'cited_context_start_position',
        'citing_context_start_position',
        'cited_context_end_position',
        'citing_context_end_position',
        'create_elapsed_time',
    }

    data_dict = cache.get(self.cache_key())
    if data_dict:
        print("Returning cached Quote Data: " + self.cited_url)
    else:
        print("Looking up data() for " + self.cited_url)

        data_dict = {
            'sha1': self.hash(),
            'citing_url': self.citing_url,
            'cited_url': self.cited_url,
        }

        # Get text version of document if text not passed into object
        citing_text = self.citing_text

        print("Populating Document: Doc, Text, Raw")
        if not citing_text or not self.citing_raw:
            # Previously this branch re-assigned the same (empty)
            # attributes, so the quote was searched for in an empty
            # text.  Fetch the citing document instead.
            # NOTE(review): the condition (text OR raw missing) is kept
            # from the original -- confirm that a missing raw alone
            # should trigger a re-fetch.
            citing_text = Document(self.citing_url).text()

        cited_doc = Document(self.cited_url)

        print("3)----data()['text']-----")
        cited_text = cited_doc.text()
        print("End Populating")

        data_dict['citing_text'] = citing_text
        data_dict['cited_text'] = cited_text

        # Find context of quote from within text
        citing_context_data = QuoteContext(
            self.citing_quote, citing_text
        ).data()
        cited_context_data = QuoteContext(
            self.citing_quote, cited_text
        ).data()

        # Populate context fields with QuoteContext results
        quote_context_fields = [
            'context_before',
            'context_after',
            'quote_length',
            'quote',
            'quote_start_position',
            'quote_end_position',
            'context_start_position',
            'context_end_position',
        ]
        for field in quote_context_fields:
            data_dict['citing_' + field] = citing_context_data[field]
            data_dict['cited_' + field] = cited_context_data[field]

        # Stop Elapsed Timer
        elapsed_time = time.time() - self.start_time
        data_dict['create_elapsed_time'] = format(elapsed_time, '.5f')

        # Cache the complete dict, never the reduced one, so a later
        # all_fields=True call is not handed a stripped result.
        # NOTE(review): 60 is presumably a timeout in seconds -- confirm
        # against the cache backend in use.
        cache.set(self.cache_key(), data_dict, 60)

    if not all_fields:
        # Build a new dict so the cached/full dict is left untouched
        return {
            key: value
            for key, value in data_dict.items()
            if key not in excluded_fields
        }

    return data_dict
def raw(self):
    """
        Return the raw() output of a Document built from self.url

        A new Document is constructed on every call.
    """
    return Document(self.url).raw()