def getComments(self, post_ids, comment_replies=True, sub_progress=(0, 1)):
    attributes = []
    class_vars = []
    metas = [
        (data.StringVariable('Message'), lambda doc: doc['message']),
        (data.DiscreteVariable('Type'), lambda doc: doc['type']),
        (data.StringVariable('Post ID'), lambda doc: doc['post_id']),
        (data.StringVariable('Comment ID'), lambda doc: doc['comment_id']),
        (data.StringVariable('Parent comment ID'),
         lambda doc: doc['parent_comment_id']),
        (data.ContinuousVariable('likes'), lambda doc: doc['likes']),
        (data.ContinuousVariable('comment replies'),
         lambda doc: doc['comment_replies']),
        (data.TimeVariable('Publication Date'),
         lambda doc: doc['status_published']),
        (data.TimeVariable('Publication Date UTC'),
         lambda doc: doc['status_published_utc']),
    ]
    text_features = [metas[0][0]]
    title_indices = [-1]

    results = []
    for doc in self._getComments(post_ids, comment_replies, sub_progress):
        doc['status_published'] = doc['status_published'].strftime(
            '%Y-%m-%dT%H:%M:%S')
        doc['status_published_utc'] = doc['status_published_utc'].strftime(
            '%Y-%m-%dT%H:%M:%S')
        results.append(doc)

    c = Corpus.from_documents(results, 'Facebook comments', attributes,
                              class_vars, metas, title_indices)
    c.set_text_features(text_features)
    return c
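# The strftime('%Y-%m-%dT%H:%M:%S') calls above serialise datetimes into ISO
# 8601 strings, which Orange's TimeVariable can parse back into numeric time
# values. A small self-contained sketch of that round trip:
from datetime import datetime
from Orange.data import TimeVariable

published = datetime(2016, 12, 31, 10, 30)
iso = published.strftime('%Y-%m-%dT%H:%M:%S')  # '2016-12-31T10:30:00'
t = TimeVariable('Publication Date')
value = t.parse(iso)        # float timestamp, ready for a table column
print(t.repr_val(value))    # rendered back as an ISO date-time string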
def search(self, query, date_from=None, date_to=None, max_docs=None,
           on_progress=None, should_break=None):
    """
    Args:
        query (str): Search query.
        date_from (date): Start date limit.
        date_to (date): End date limit.
        max_docs (int): Maximum number of documents returned.
        on_progress (callback): Called after every iteration of downloading.
        should_break (callback): Callback for breaking the computation
            before the end. If it evaluates to True, downloading is stopped
            and the documents downloaded so far are returned in a Corpus.

    Returns:
        Corpus: Search results.
    """
    if max_docs is None or max_docs > MAX_DOCS:
        max_docs = MAX_DOCS

    # TODO create corpus on the fly and extend, so it stops faster.
    records = []
    data, go_sleep = self._fetch_page(query, date_from, date_to, 0)
    if data is None:
        return None
    records.extend(data['response']['docs'])
    max_docs = min(data['response']['meta']['hits'], max_docs)
    if callable(on_progress):
        on_progress(len(records), max_docs)

    for page in range(1, math.ceil(max_docs / BATCH_SIZE)):
        if callable(should_break) and should_break():
            break
        if go_sleep:
            sleep(SLEEP)
        data, go_sleep = self._fetch_page(query, date_from, date_to, page)
        if data is None:
            break
        records.extend(data['response']['docs'])
        if callable(on_progress):
            on_progress(len(records), max_docs)

    if len(records) > max_docs:
        records = records[:max_docs]

    return Corpus.from_documents(records, 'NY Times', self.attributes,
                                 self.class_vars, self.metas,
                                 title_indices=[-1])
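# One way to use should_break: stop the download after a fixed time budget.
# A minimal sketch; the client class name (NYT) and its constructor argument
# are assumptions, not taken from the code above:
import time

deadline = time.monotonic() + 30          # give up after 30 seconds

corpus = NYT('my-api-key').search(        # hypothetical construction
    'election',
    should_break=lambda: time.monotonic() > deadline)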
def _corpus_from_results(docs):
    c = Corpus.from_documents(list(docs), 'AmCAT', attributes=[],
                              class_vars=[], metas=CORPUS_METAS,
                              title_indices=[-1])
    return c
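# Corpus.from_documents expects metas as (Variable, extractor) pairs, so
# CORPUS_METAS is presumably defined along these lines; the AmCAT field
# names used below are assumptions, not taken from the source:
from Orange import data

CORPUS_METAS = [
    (data.StringVariable('Headline'), lambda doc: doc.get('headline', '')),
    (data.StringVariable('Text'), lambda doc: doc.get('text', '')),
]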
def search(self, query, date_from=None, date_to=None, max_docs=None,
           on_progress=None, should_break=None):
    """
    Args:
        query (str): Search query.
        date_from (date): Start date limit.
        date_to (date): End date limit.
        max_docs (int): Maximum number of documents returned.
        on_progress (callback): Called after every iteration of downloading.
        should_break (callback): Callback for breaking the computation
            before the end. If it evaluates to True, downloading is stopped
            and the documents downloaded so far are returned in a Corpus.

    Returns:
        Corpus: Search results.
    """
    if not self.api_key_valid():
        raise RuntimeError('The API key is not valid.')

    if max_docs is None or max_docs > MAX_DOCS:
        max_docs = MAX_DOCS

    # TODO create corpus on the fly and extend, so it stops faster.
    records = []
    data, cached = self._fetch_page(query, date_from, date_to, 0)
    if data is None:
        return None
    records.extend(data['response']['docs'])
    max_docs = min(data['response']['meta']['hits'], max_docs)
    if callable(on_progress):
        on_progress(len(records), max_docs)

    for page in range(1, math.ceil(max_docs / BATCH_SIZE)):
        if callable(should_break) and should_break():
            break
        data, cached = self._fetch_page(query, date_from, date_to, page)
        if data is None:
            break
        records.extend(data['response']['docs'])
        if callable(on_progress):
            on_progress(len(records), max_docs)
        if not cached:
            sleep(SLEEP)

    if len(records) > max_docs:
        records = records[:max_docs]

    return Corpus.from_documents(records, 'NY Times', self.attributes,
                                 self.class_vars, self.metas,
                                 title_indices=[-1])
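# A minimal sketch of driving search() with both callbacks; the client class
# name (NYT) and its constructor argument are assumptions:
from datetime import date

def report(downloaded, total):
    print('{}/{} documents'.format(downloaded, total))

stop_requested = False

api = NYT('my-api-key')                       # hypothetical construction
corpus = api.search('quantum computing',
                    date_from=date(2016, 1, 1),
                    max_docs=50,
                    on_progress=report,
                    should_break=lambda: stop_requested)
if corpus is not None:
    print(len(corpus), 'documents retrieved')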
def search_posts(self, post_ids, sub_progress=(0, 1)):
    results = []
    for doc in self._search_posts(post_ids, sub_progress):
        doc['status_published'] = doc['status_published'].strftime(
            '%Y-%m-%dT%H:%M:%S')
        doc['status_published_utc'] = doc['status_published_utc'].strftime(
            '%Y-%m-%dT%H:%M:%S')
        results.append(doc)

    c = Corpus.from_documents(results, 'Facebook', self.attributes,
                              self.class_vars, self.post_metas,
                              self.title_indices)
    c.set_text_features(self.text_features)
    return c
def test_from_documents(self):
    documents = [
        {
            'wheels': 4,
            'engine': 'w4',
            'type': 'car',
            'desc': 'A new car.'
        },
        {
            'wheels': 8.,
            'engine': 'w8',
            'type': 'truck',
            'desc': 'An old truck.'
        },
        {
            'wheels': 12.,
            'engine': 'w12',
            'type': 'truck',
            'desc': 'A new truck.'
        }
    ]

    attrs = [
        (DiscreteVariable('Engine'), lambda doc: doc.get('engine')),
        (ContinuousVariable('Wheels'), lambda doc: doc.get('wheels')),
    ]

    class_vars = [
        (DiscreteVariable('Type'), lambda doc: doc.get('type')),
    ]

    metas = [
        (StringVariable('Description'), lambda doc: doc.get('desc')),
    ]

    dataset_name = 'TruckData'
    c = Corpus.from_documents(documents, dataset_name, attrs, class_vars,
                              metas)

    self.assertEqual(len(c), len(documents))
    self.assertEqual(c.name, dataset_name)
    self.assertEqual(len(c.domain.attributes), len(attrs))
    self.assertEqual(len(c.domain.class_vars), len(class_vars))
    self.assertEqual(len(c.domain.metas), len(metas))

    engine_dv = c.domain.attributes[0]
    self.assertEqual(sorted(engine_dv.values),
                     sorted([d['engine'] for d in documents]))
    self.assertEqual([engine_dv.repr_val(v) for v in c.X[:, 0]],
                     [d['engine'] for d in documents])
def search(self, query, from_date=None, to_date=None, max_documents=None,
           accumulate=False):
    """
    Search The Guardian API for articles.

    Args:
        query (str): A query for searching the articles by.
        from_date (str): Search only articles newer than the date provided.
            Date should be in ISO format; e.g. '2016-12-31'.
        to_date (str): Search only articles older than the date provided.
            Date should be in ISO format; e.g. '2016-12-31'.
        max_documents (int): Maximum number of documents to retrieve.
            When not given, retrieve all documents.
        accumulate (bool): A flag indicating whether to accumulate results
            of multiple consecutive search calls.

    Returns:
        :ref:`Corpus`
    """
    if not accumulate:
        self.results = []

    self._search(query, from_date, to_date)

    pages = math.ceil(max_documents / self.per_page) if max_documents \
        else self.pages
    self.on_progress(self.per_page, pages * self.per_page)

    for p in range(2, pages + 1):   # to one based
        if self.should_break():
            break
        self._search(query, from_date, to_date, p)
        self.on_progress(p * self.per_page, pages * self.per_page)

    c = Corpus.from_documents(self.results, 'The Guardian', self.attributes,
                              self.class_vars, self.metas,
                              title_indices=self.title_indices)
    c.text_features = self.text_features
    return c
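# A minimal sketch of a caller; the credential and client class names are
# assumptions, not taken from the code above:
credentials = TheGuardianCredentials('my-api-key')   # hypothetical class
api = TheGuardianAPI(credentials)                    # hypothetical class
first = api.search('brexit', from_date='2016-06-01', max_documents=20)
# With accumulate=True the next call appends to self.results, so the
# returned corpus grows across consecutive searches:
combined = api.search('euro', max_documents=20, accumulate=True)
print(len(first), len(combined))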
def test_highlighting_non_latin(self):
    documents = [{
        'content': """царстве есть сад с молодильными яблоками"""
    }]
    metas = [
        (StringVariable('content'), lambda doc: doc.get('content')),
    ]
    dataset_name = 'RussianDocument'
    corpus = Corpus.from_documents(documents, dataset_name, metas=metas)
    self.send_signal(self.widget.Inputs.corpus, corpus)

    self.widget.regexp_filter = "\\bсад\\b"
    self.process_events()
    self.widget.doc_webview.html()
    spy = QSignalSpy(self.widget.doc_webview.loadFinished)
    spy.wait()
    html = self.widget.doc_webview.html()
    self.assertIn('<mark data-markjs="true">', html)
def search(self, page_ids, mode='posts',
           since=datetime.now() - timedelta(10), until=datetime.now(),
           max_documents=None, sub_progress=(0, 1)):
    # Note: the since/until defaults are evaluated once, at function
    # definition time, not at each call.
    results = []
    for doc in self._search(page_ids, mode, since, until, max_documents,
                            sub_progress):
        doc['status_published'] = doc['status_published'].strftime(
            '%Y-%m-%dT%H:%M:%S')
        doc['status_published_utc'] = doc['status_published_utc'].strftime(
            '%Y-%m-%dT%H:%M:%S')
        results.append(doc)

    c = Corpus.from_documents(results, 'Facebook', self.attributes,
                              self.class_vars, self.post_metas,
                              self.title_indices)
    c.set_text_features(self.text_features)
    return c
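# A minimal sketch of a caller; the client class name and the access-token
# argument are assumptions (the Facebook Graph API requires authentication):
from datetime import datetime, timedelta

api = FacebookClient(access_token='token')    # hypothetical construction
posts = api.search(['nytimes'], mode='posts',
                   since=datetime.now() - timedelta(30),
                   max_documents=100)
print(len(posts), 'posts retrieved')
# getComments() (above) then takes a list of post IDs, e.g. collected from
# the returned corpus, and fetches the comment threads for those posts.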
def boostdict(m, bd):
    # Recursively walk a matcher tree, recording the boost of every span.
    if hasattr(m, 'spans'):
        for s in m.spans():
            bd[s] = float(m.boost)
        return bd
    if hasattr(m, 'children'):
        children = m.children()
        for child in children:
            bd = boostdict(child, bd)
    return bd


if __name__ == '__main__':
    from Orange import data
    from orangecontrib.text.corpus import Corpus

    metas = [(data.StringVariable('headline'), lambda doc: doc['headline']),
             (data.StringVariable('text'), lambda doc: doc['text']),
             (data.StringVariable('id'), lambda doc: doc['id'])]
    text_features = [metas[0][0], metas[1][0]]
    title_indices = [-1]
    d = [{'headline': 'titel_a',
          'text': 'x x y this tests a test y x x',
          'id': '1'},
         {'headline': 'titel_b',
          'text': 'more tests!!!',
          'id': '2'}]
    c = Corpus.from_documents(d, 'example', [], [], metas, title_indices)
    c.set_text_features(text_features)

    _ix = Index(c)

    import numpy as np

    terms = list(_ix.term_statistics())
    words = np.array([["a"], ["b"]])
    print(words)