def citations(self):
    """
    Return a list of Quote Lookup results for all citations on this page.

    Uses a worker pool to process the lookups in parallel: calls
    load_quote_data() for every entry of self.citations_list_dict()
    through the pool's 'map' function.

    Returns:
        The list produced by pool.map (one result per citation entry).

    Raises:
        Whatever load_quote_data() raises; the pool is still shut down.
    """
    # Pre-fetch and cache a URL if it is found in more than one quote.
    # This prevents sources from being clobbered with multiple
    # identical requests in parallel.
    for url in self.citations_list_duplicates():
        # request and cache result so parallel requests come from cache
        Document(url).download_resource()

    # Load Quote data in parallel:
    citations_list_dict = self.citations_list_dict()
    pool = Pool(processes=settings.NUM_DOWNLOAD_PROCESSES)
    try:
        # TODO: add better error handling for individual failed lookups
        return pool.map(load_quote_data, citations_list_dict)
    finally:
        # Always release the workers, even when a lookup raised;
        # otherwise the worker processes are left behind.
        pool.close()
        pool.join()
def document_text_version():
    """
    Return the text version of the document at the requested URL.

    Query parameters read from the request:
        url: passed to Document as the document address (default '').
        line_separator: passed through to Document (default '').
        timesplits: any non-empty value turns the flag on.

    Returns:
        A response whose body is Document.text(), served as text/plain.
    """
    url = request.args.get('url', '')
    line_separator = request.args.get('line_separator', '')
    # Missing or empty parameter -> False, any other value -> True.
    timesplits = bool(request.args.get('timesplits'))

    d = Document(url, line_separator, timesplits)
    response = app.make_response(d.text())
    # A bare "text" is not a valid MIME type (type/subtype is required),
    # so declare the body as plain text explicitly.
    response.mimetype = "text/plain"
    return response
def data(self, text_output=True, all_fields=False):
    """
    Calculate context of quotation using QuoteContext class.

    Optionally return a smaller subset of fields to upload to cloud.

    Args:
        text_output: accepted for backward compatibility but never read
            here; the instance attribute ``self.text_output`` decides
            whether the full document texts stay in the result.
        all_fields: when false, the raw documents, the position fields,
            'citing_quote_length' and 'create_elapsed_time' are removed
            from the result.

    Returns:
        dict holding 'sha1', 'citing_url', 'cited_url' and the
        citing_*/cited_* context fields that survive the filters above.
    """
    data_dict = {
        'sha1': self.hash(),
        'citing_url': self.citing_url,
        'cited_url': self.cited_url,
    }

    # Get text version of the citing document if text was not passed
    # into the object. Truthiness (instead of len() == 0) also covers
    # None without raising TypeError.
    citing_text = self.citing_text
    citing_raw = self.citing_raw
    citing_doc = None
    if not citing_text or not citing_raw:
        citing_doc = Document(self.citing_url)
        citing_text = citing_doc.data()['text']

    cited_doc = Document(self.cited_url)
    cited_text = cited_doc.data()['text']

    # Raw output is only available when the citing document was fetched here.
    if self.raw_output and citing_doc:
        data_dict['citing_raw'] = citing_doc.raw()
        data_dict['cited_raw'] = cited_doc.raw()

    data_dict['citing_text'] = citing_text
    data_dict['cited_text'] = cited_text

    # data_dict['citing_doc_type'] = citing_doc.data()['doc_type']
    # data_dict['cited_doc_type'] = cited_doc.data()['doc_type']

    # Find context of quote from within text
    citing_context = QuoteContext(self.citing_quote, citing_text)
    cited_context = QuoteContext(self.citing_quote, cited_text)

    # Evaluate each context once instead of once per copied field.
    citing_context_data = citing_context.data()
    cited_context_data = cited_context.data()

    # Populate context fields from the QuoteContext results
    quote_context_fields = [
        'context_before',
        'context_after',
        'quote',
        'quote_start_position',
        'quote_end_position',
        'context_start_position',
        'context_end_position',
        'quote_length',
        # 'encoding', 'encoding_confidence', 'language'
    ]
    for field in quote_context_fields:
        data_dict['citing_' + field] = citing_context_data[field]
        data_dict['cited_' + field] = cited_context_data[field]

    # Stop Elapsed Timer
    elapsed_time = time.time() - self.start_time
    data_dict['create_elapsed_time'] = format(elapsed_time, '.5f')

    if not self.text_output:
        # Both keys are always present at this point.
        for excluded_field in ('citing_text', 'cited_text'):
            data_dict.pop(excluded_field)

    if not all_fields:
        excluded_fields = [
            'cited_raw',
            'citing_raw',
            'citing_quote_length',
            'cited_quote_start_position',
            'citing_quote_start_position',
            'cited_quote_end_position',
            'citing_quote_end_position',
            'cited_context_start_position',
            'citing_context_start_position',
            'cited_context_end_position',
            'citing_context_end_position',
            'create_elapsed_time',
        ]
        for excluded_field in excluded_fields:
            # The raw fields may be absent, so tolerate missing keys.
            data_dict.pop(excluded_field, None)

    return data_dict
def document_meta_data():
    """
    Return the data of the document at the requested URL as JSON.

    Reads the 'url' query parameter (default '') and the 'verbose'
    query parameter (default True), which is handed to Document.data()
    as verbose_view.
    """
    query = request.args
    target_url, verbose = query.get('url', ''), query.get('verbose', True)
    return jsonify(Document(target_url).data(verbose_view=verbose))
def doc(self):
    """ Build a Document for this object's url """
    document = Document(self.url)
    return document
def citing_doc(self):
    """ Build the Document for the citing url, passing along self.request_id """
    source_url = self.citing_url()
    return Document(source_url, self.request_id)
def doc(self):
    """ Wrap self.url in a new Document instance """
    wrapped = Document(self.url)
    return wrapped
# Copyright (C) 2015-2020 Tim Langeman and contributors
# <see AUTHORS.txt file>
#
# This library is part of the CiteIt project:
# http://www.citeit.net/
# The code for this server library is released under the MIT License:
# http://www.opensource.org/licenses/mit-license

from lib.citeit_quote_context.quote import Quote
from lib.citeit_quote_context.document import Document

# Disabled example: build a Quote and print its hash values.
# q = Quote(
#     "<p>Be <b>conservative</b> in what you send, be <b>liberal</b> in what you accept</p>",
#     "https://192.168.64.2/2017/04/12/was-jesus-a-postel-christian/",
#     "https://en.wikipedia.org/wiki/Robustness_principle"
# )
# print(q.hash())
# print(q.hashkey())

# Load a document from its URL and print the text version of it.
url = 'https://www.youtube.com/watch?v=Okg2LH6XKzY&feature=youtu.be'
doc = Document(url)
doc_text = doc.text()
print(doc_text)