Exemple #1
0
    def citations(self):
        """Return a list of Quote Lookup results for all citations on this page.

        URLs that appear in more than one quote are requested once up front,
        so the parallel lookups that follow are served from the cache instead
        of sending several simultaneous requests to the same source.

        ``load_quote_data`` is then mapped over the entries returned by
        ``self.citations_list_dict()`` using a pool created with
        ``settings.NUM_DOWNLOAD_PROCESSES`` processes.

        Returns:
            The list produced by ``pool.map``: one result per citation entry.

        Raises:
            Whatever ``pool.map`` raises; the pool is still closed and joined
            before the exception propagates.
        """
        # Pre-fetch and cache each URL that is found in more than one quote;
        # this prevents sources from being clobbered with multiple requests
        # in parallel.
        for url in self.citations_list_duplicates():
            Document(url).download_resource()

        # Load Quote data in parallel:
        citations_list_dict = self.citations_list_dict()

        pool = Pool(processes=settings.NUM_DOWNLOAD_PROCESSES)
        try:
            # TODO: add better error handling for individual failed lookups
            return pool.map(load_quote_data, citations_list_dict)
        finally:
            # Release the pool even when a lookup raises, so a failed page
            # does not leave the pool open behind it.
            pool.close()
            pool.join()
Exemple #2
0
def document_text_version():
    """Return the text version of the document named in the query string.

    Query parameters read from ``request.args``:
        url: address handed to ``Document`` (defaults to ``''``).
        line_separator: passed through to ``Document`` (defaults to ``''``).
        timesplits: any non-empty value is treated as True, otherwise False.

    Returns:
        A response wrapping ``Document.text()`` with its mimetype set to
        ``"text"``.
    """
    params = request.args
    source_url = params.get('url', '')
    separator = params.get('line_separator', '')
    # Only presence matters: collapse the raw query value to a real boolean.
    use_timesplits = bool(params.get('timesplits'))

    document = Document(source_url, separator, use_timesplits)

    text_response = app.make_response(document.text())
    # NOTE(review): "text" is not a full type/subtype value; confirm whether
    # "text/plain" was intended before changing it.
    text_response.mimetype = "text"
    return text_response
Exemple #3
0
    def data(self, text_output=True, all_fields=False):
        """Calculate the context of the quotation and return it as a dict.

        ``self.citing_quote`` is located in both the citing and the cited
        text with ``QuoteContext``; each context field is copied into the
        result twice, once with a ``citing_`` and once with a ``cited_``
        prefix.

        Args:
            text_output: when False, ``citing_text`` and ``cited_text`` are
                omitted from the result. They are also omitted whenever
                ``self.text_output`` is false.
            all_fields: when False, a smaller subset is returned (e.g. for
                uploading to the cloud): raw documents, position fields,
                ``citing_quote_length`` and ``create_elapsed_time`` are
                removed.

        Returns:
            dict with ``sha1``, ``citing_url``, ``cited_url`` plus the text
            and context fields selected by the flags above.
        """
        data_dict = {
            'sha1': self.hash(),
            'citing_url': self.citing_url,
            'cited_url': self.cited_url,
        }

        # Get text version of the citing document if text (or raw content)
        # was not passed into the object. Truthiness also covers None.
        citing_text = self.citing_text
        citing_doc = None
        if not citing_text or not self.citing_raw:
            citing_doc = Document(self.citing_url)
            citing_text = citing_doc.data()['text']
        cited_doc = Document(self.cited_url)
        cited_text = cited_doc.data()['text']

        # NOTE(review): raw output is only emitted when the citing document
        # was fetched here; a citing_raw passed into the object is not
        # echoed back. Confirm whether that is intended.
        if self.raw_output and citing_doc:
            data_dict['citing_raw'] = citing_doc.raw()
            data_dict['cited_raw'] = cited_doc.raw()

        data_dict['citing_text'] = citing_text
        data_dict['cited_text'] = cited_text

        # Find context of quote from within text. The context dicts are
        # computed once here rather than once per field inside the loop.
        citing_context = QuoteContext(self.citing_quote, citing_text).data()
        cited_context = QuoteContext(self.citing_quote, cited_text).data()

        # Populate context fields for both sides of the citation
        quote_context_fields = [
            'context_before', 'context_after', 'quote', 'quote_start_position',
            'quote_end_position', 'context_start_position',
            'context_end_position', 'quote_length',
        ]
        for field in quote_context_fields:
            data_dict['citing_' + field] = citing_context[field]
            data_dict['cited_' + field] = cited_context[field]

        # Stop Elapsed Timer
        elapsed_time = time.time() - self.start_time
        data_dict['create_elapsed_time'] = format(elapsed_time, '.5f')

        # Honour the caller's flag as well as the instance-level setting.
        if not (text_output and self.text_output):
            for excluded_field in ('citing_text', 'cited_text'):
                data_dict.pop(excluded_field, None)

        if not all_fields:
            excluded_fields = [
                'cited_raw',
                'citing_raw',
                'citing_quote_length',
                'cited_quote_start_position',
                'citing_quote_start_position',
                'cited_quote_end_position',
                'citing_quote_end_position',
                'cited_context_start_position',
                'citing_context_start_position',
                'cited_context_end_position',
                'citing_context_end_position',
                'create_elapsed_time',
            ]
            for excluded_field in excluded_fields:
                # The raw fields may be absent, so pop with a default.
                data_dict.pop(excluded_field, None)

        return data_dict
Exemple #4
0
def document_meta_data():
    """Return the data of the document named in the query string as JSON.

    Reads ``url`` (default ``''``) and ``verbose`` (default ``True``) from
    ``request.args`` and passes the latter to ``Document.data`` as
    ``verbose_view``.
    """
    query = request.args
    target_url = query.get('url', '')
    # NOTE(review): a supplied ``verbose`` value arrives as the raw query
    # value, not a boolean; confirm how Document.data interprets it.
    verbose_flag = query.get('verbose', True)
    return jsonify(Document(target_url).data(verbose_view=verbose_flag))
Exemple #5
0
 def doc(self):
     """Build and return a Document for ``self.url``."""
     document = Document(self.url)
     return document
Exemple #6
0
 def citing_doc(self):
     """Return a Document built from ``self.citing_url()`` and ``self.request_id``."""
     source_url = self.citing_url()
     return Document(source_url, self.request_id)
Exemple #7
0
 def doc(self):
     """Return a new Document constructed from ``self.url``."""
     document = Document(self.url)
     return document
Exemple #8
0
# Copyright (C) 2015-2020 Tim Langeman and contributors
# <see AUTHORS.txt file>
#
# This library is part of the CiteIt project:
# http://www.citeit.net/

# The code for this server library is released under the MIT License:
# http://www.opensource.org/licenses/mit-license

from lib.citeit_quote_context.quote import Quote

# q = Quote(
# "<p>Be <b>conservative</b> in what you send, be <b>liberal</b> in what you accept</p>",
# "https://192.168.64.2/2017/04/12/was-jesus-a-postel-christian/",
# "https://en.wikipedia.org/wiki/Robustness_principle"
# )

#print(q.hash())

#print(q.hashkey())

from lib.citeit_quote_context.document import Document

# Manual check: build a Document for a YouTube watch URL and print whatever
# text() returns for it. Runs at import time; there is no __main__ guard.
url = 'https://www.youtube.com/watch?v=Okg2LH6XKzY&feature=youtu.be'

d = Document(url)

# NOTE(review): text() presumably fetches the resource over the network;
# confirm before running this somewhere without connectivity.
text = d.text()

print(text)