Code Example #1
    def getComments(self, post_ids, comment_replies=True, sub_progress=(0, 1)):
        attributes = []
        class_vars = []
        metas = [(data.StringVariable('Message'), lambda doc: doc['message']),
                 (data.DiscreteVariable('Type'), lambda doc: doc['type']),
                 (data.StringVariable('Post ID'), lambda doc: doc['post_id']),
                 (data.StringVariable('Comment ID'),
                  lambda doc: doc['comment_id']),
                 (data.StringVariable('Parent comment ID'),
                  lambda doc: doc['parent_comment_id']),
                 (data.ContinuousVariable('likes'), lambda doc: doc['likes']),
                 (data.ContinuousVariable('comment replies'),
                  lambda doc: doc['comment_replies']),
                 (data.TimeVariable('Publication Date'),
                  lambda doc: doc['status_published']),
                 (data.TimeVariable('Publication Date UTC'),
                  lambda doc: doc['status_published_utc'])]
        text_features = [metas[0][0]]
        title_indices = [-1]

        results = []
        for doc in self._getComments(post_ids, comment_replies, sub_progress):
            doc['status_published'] = doc['status_published'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            doc['status_published_utc'] = doc['status_published_utc'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            results.append(doc)

        c = Corpus.from_documents(results, 'Facebook comments', attributes,
                                  class_vars, metas, title_indices)
        c.set_text_features(text_features)
        return c
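All of the examples on this page follow the same pattern: pair each Orange variable with an extractor callable, pass the document dicts to Corpus.from_documents, and mark which meta columns hold the text. A minimal self-contained sketch of that pattern (the 'title'/'body' documents are hypothetical, not taken from any example above):

from Orange import data
from orangecontrib.text.corpus import Corpus

# Hypothetical documents; the real callers fetch these from an API.
docs = [{'title': 'Hello', 'body': 'First document.'},
        {'title': 'World', 'body': 'Second document.'}]

# Each meta entry pairs a variable with a callable that extracts its
# value from a document dict.
metas = [(data.StringVariable('Title'), lambda doc: doc['title']),
         (data.StringVariable('Body'), lambda doc: doc['body'])]

corpus = Corpus.from_documents(docs, 'Example', attributes=[], class_vars=[],
                               metas=metas, title_indices=[-1])
corpus.set_text_features([metas[1][0]])  # treat the 'Body' column as text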
Code Example #2
    def search(self,
               query,
               date_from=None,
               date_to=None,
               max_docs=None,
               on_progress=None,
               should_break=None):
        """
        Args:
            query (str): Search query.
            date_from (date): Start date limit.
            date_to (date): End date limit.
            max_docs (int): Maximum number of documents to return.
            on_progress (callback): Called after each page of results is downloaded.
            should_break (callback): Callback for stopping the computation early.
                If it returns True, downloading stops and the documents downloaded
                so far are returned as a Corpus.

        Returns:
            Corpus: Search results.
        """
        if max_docs is None or max_docs > MAX_DOCS:
            max_docs = MAX_DOCS

        # TODO: build the corpus incrementally so that breaking early is faster.
        records = []
        data, go_sleep = self._fetch_page(query, date_from, date_to, 0)
        if data is None:
            return None
        records.extend(data['response']['docs'])
        max_docs = min(data['response']['meta']['hits'], max_docs)
        if callable(on_progress):
            on_progress(len(records), max_docs)

        for page in range(1, math.ceil(max_docs / BATCH_SIZE)):
            if callable(should_break) and should_break():
                break

            if go_sleep:
                sleep(SLEEP)

            data, go_sleep = self._fetch_page(query, date_from, date_to, page)

            if data is None:
                break

            records.extend(data['response']['docs'])
            if callable(on_progress):
                on_progress(len(records), max_docs)

        if len(records) > max_docs:
            records = records[:max_docs]

        return Corpus.from_documents(records,
                                     'NY Times',
                                     self.attributes,
                                     self.class_vars,
                                     self.metas,
                                     title_indices=[-1])
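The on_progress and should_break hooks let a caller report progress and cancel a download that is still running. A hypothetical caller sketch ('client' stands in for an instance of the class defining search(), which is not shown in this excerpt):

cancelled = False

def report(done, total):
    print('{}/{} documents'.format(done, total))

corpus = client.search('election', max_docs=50,
                       on_progress=report,
                       should_break=lambda: cancelled)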
Code Example #3
def _corpus_from_results(docs):
    c = Corpus.from_documents(list(docs),
                              'AmCAT',
                              attributes=[],
                              class_vars=[],
                              metas=CORPUS_METAS,
                              title_indices=[-1])
    return c
Code Example #4
File: nyt.py  Project: nikicc/orange3-text
    def search(self, query, date_from=None, date_to=None, max_docs=None,
               on_progress=None, should_break=None):
        """
        Args:
            query (str): Search query.
            date_from (date): Start date limit.
            date_to (date): End date limit.
            max_docs (int): Maximum number of documents to return.
            on_progress (callback): Called after each page of results is downloaded.
            should_break (callback): Callback for stopping the computation early.
                If it returns True, downloading stops and the documents downloaded
                so far are returned as a Corpus.

        Returns:
            Corpus: Search results.
        """
        if not self.api_key_valid():
            raise RuntimeError('The API key is not valid.')
        if max_docs is None or max_docs > MAX_DOCS:
            max_docs = MAX_DOCS

        # TODO: build the corpus incrementally so that breaking early is faster.
        records = []
        data, cached = self._fetch_page(query, date_from, date_to, 0)
        if data is None:
            return None
        records.extend(data['response']['docs'])
        max_docs = min(data['response']['meta']['hits'], max_docs)
        if callable(on_progress):
            on_progress(len(records), max_docs)

        for page in range(1, math.ceil(max_docs/BATCH_SIZE)):
            if callable(should_break) and should_break():
                break

            data, cached = self._fetch_page(query, date_from, date_to, page)

            if data is None:
                break

            records.extend(data['response']['docs'])

            if callable(on_progress):
                on_progress(len(records), max_docs)

            if not cached:
                sleep(SLEEP)

        if len(records) > max_docs:
            records = records[:max_docs]

        return Corpus.from_documents(records, 'NY Times', self.attributes,
                                     self.class_vars, self.metas, title_indices=[-1])
Code Example #5
    def search_posts(self, post_ids, sub_progress=(0, 1)):
        results = []
        for doc in self._search_posts(post_ids, sub_progress):
            doc['status_published'] = doc['status_published'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            doc['status_published_utc'] = doc['status_published_utc'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            results.append(doc)

        c = Corpus.from_documents(results, 'Facebook', self.attributes,
                                  self.class_vars, self.post_metas,
                                  self.title_indices)
        c.set_text_features(self.text_features)
        return c
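The strftime pattern used above serializes the API's datetime objects as ISO-8601 strings, the format data.TimeVariable expects. For example:

from datetime import datetime

dt = datetime(2017, 5, 1, 12, 30)
print(dt.strftime('%Y-%m-%dT%H:%M:%S'))  # 2017-05-01T12:30:00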
Code Example #6
File: test_corpus.py  Project: s-alexey/orange3-text
    def test_from_documents(self):
        documents = [
            {
                'wheels': 4,
                'engine': 'w4',
                'type': 'car',
                'desc': 'A new car.'
            },
            {
                'wheels': 8.,
                'engine': 'w8',
                'type': 'truck',
                'desc': 'An old truck.'
            },
            {
                'wheels': 12.,
                'engine': 'w12',
                'type': 'truck',
                'desc': 'A new truck.'
            }
        ]

        attrs = [
            (DiscreteVariable('Engine'), lambda doc: doc.get('engine')),
            (ContinuousVariable('Wheels'), lambda doc: doc.get('wheels')),
        ]

        class_vars = [
            (DiscreteVariable('Type'), lambda doc: doc.get('type')),
        ]

        metas = [
            (StringVariable('Description'), lambda doc: doc.get('desc')),
        ]

        dataset_name = 'TruckData'
        c = Corpus.from_documents(documents, dataset_name, attrs, class_vars, metas)

        self.assertEqual(len(c), len(documents))
        self.assertEqual(c.name, dataset_name)
        self.assertEqual(len(c.domain.attributes), len(attrs))
        self.assertEqual(len(c.domain.class_vars), len(class_vars))
        self.assertEqual(len(c.domain.metas), len(metas))

        engine_dv = c.domain.attributes[0]
        self.assertEqual(sorted(engine_dv.values),
                         sorted([d['engine'] for d in documents]))
        self.assertEqual([engine_dv.repr_val(v) for v in c.X[:, 0]],
                         [d['engine'] for d in documents])
Code Example #7
File: guardian.py  Project: zhoubo3666/orange3-text
    def search(self,
               query,
               from_date=None,
               to_date=None,
               max_documents=None,
               accumulate=False):
        """
        Search The Guardian API for articles.

        Args:
            query (str): The query to search articles by.
            from_date (str): Search only articles newer than the date provided.
                Date should be in ISO format; e.g. '2016-12-31'.
            to_date (str): Search only articles older than the date provided.
                Date should be in ISO format; e.g. '2016-12-31'.
            max_documents (int): Maximum number of documents to retrieve.
                When not given, retrieve all documents.
            accumulate (bool): Whether to accumulate the results of
                consecutive search calls.

        Returns:
            :ref:`Corpus`
        """
        if not accumulate:
            self.results = []

        self._search(query, from_date, to_date)

        pages = math.ceil(max_documents /
                          self.per_page) if max_documents else self.pages
        self.on_progress(self.per_page, pages * self.per_page)

        for p in range(2, pages + 1):  # page numbers are 1-based
            if self.should_break():
                break
            self._search(query, from_date, to_date, p)
            self.on_progress(p * self.per_page, pages * self.per_page)

        c = Corpus.from_documents(self.results,
                                  'The Guardian',
                                  self.attributes,
                                  self.class_vars,
                                  self.metas,
                                  title_indices=self.title_indices)
        c.text_features = self.text_features
        return c
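The accumulate flag makes consecutive calls extend self.results instead of replacing them, so several queries can be merged into a single Corpus. A hypothetical usage sketch ('client' is an assumed instance of the class above):

client.search('brexit', from_date='2016-01-01', max_documents=100)
# The second call appends its results to the first call's:
corpus = client.search('referendum', max_documents=100, accumulate=True)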
Code Example #8
    def test_highlighting_non_latin(self):
        documents = [{
            'content': """царстве есть сад с молодильными яблоками"""
        }]
        metas = [
            (StringVariable('content'), lambda doc: doc.get('content')),
        ]
        dataset_name = 'RussianDocument'
        corpus = Corpus.from_documents(documents, dataset_name, metas=metas)

        self.send_signal(self.widget.Inputs.corpus, corpus)
        self.widget.regexp_filter = "\\bсад\\b"
        self.process_events()
        self.widget.doc_webview.html()
        spy = QSignalSpy(self.widget.doc_webview.loadFinished)
        spy.wait()
        html = self.widget.doc_webview.html()
        self.assertIn('<mark data-markjs="true">', html)
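The regular expression works because Python 3's re module treats \b as Unicode-aware for str patterns, so word boundaries match around Cyrillic words as well:

import re

print(bool(re.search(r'\bсад\b', 'есть сад с яблоками')))  # True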
Code Example #9
    def search(self,
               page_ids,
               mode='posts',
               since=datetime.now() - timedelta(10),  # note: defaults are evaluated once, when the function is defined
               until=datetime.now(),
               max_documents=None,
               sub_progress=(0, 1)):
        results = []
        for doc in self._search(page_ids, mode, since, until, max_documents,
                                sub_progress):
            doc['status_published'] = doc['status_published'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            doc['status_published_utc'] = doc['status_published_utc'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            results.append(doc)

        c = Corpus.from_documents(results, 'Facebook', self.attributes,
                                  self.class_vars, self.post_metas,
                                  self.title_indices)
        c.set_text_features(self.text_features)
        return c
Code Example #10
def boostdict(m, bd):
    # Recursively collect the boost value for every span in a matcher tree.
    if hasattr(m, 'spans'):
        for s in m.spans():
            bd[s] = float(m.boost)
    if hasattr(m, 'children'):
        for child in m.children():
            bd = boostdict(child, bd)
    return bd

if __name__ == '__main__':
    from Orange import data
    from orangecontrib.text.corpus import Corpus

    metas = [(data.StringVariable('headline'), lambda doc: doc['headline']),
             (data.StringVariable('text'), lambda doc: doc['text']),
             (data.StringVariable('id'), lambda doc: doc['id'])]
    text_features = [metas[0][0], metas[1][0]]
    title_indices = [-1]
    d = [{'headline': 'titel_a', 'text': 'x x y this tests a test y x x', 'id': '1'},
         {'headline': 'titel_b', 'text': 'more tests!!!', 'id': '2'}]
    c = Corpus.from_documents(d, 'example', [], [], metas, title_indices)
    c.set_text_features(text_features)

    _ix = Index(c)  # Index is defined earlier in the same module

    import numpy as np
    terms = list(_ix.term_statistics())
    words = np.array([["a"], ["b"]])
    print(words)