def search(self, lang, queries, articles_per_query=10, should_break=None, on_progress=None):
    """Search Wikipedia and gather the matching articles into a Corpus.

    Args:
        lang (str): A language code in ISO 639-1 format.
        queries (list of str): A list of queries.
        should_break (callable): Callback for breaking the computation
            before the end. If it evaluates to True, downloading stops and
            the documents collected so far are returned in a Corpus.
        on_progress (callable): Callback for progress bar.
    """
    wikipedia.set_lang(lang)
    collected = []
    # Total work units, used as the progress denominator.
    total = len(queries) * articles_per_query
    for query_idx, query in enumerate(queries):
        try:
            hits = wikipedia.search(query, results=articles_per_query)
            for hit_idx, hit in enumerate(hits):
                if callable(should_break) and should_break():
                    break
                collected.extend(self._get(hit, query, should_break))
                if callable(on_progress):
                    done = query_idx * articles_per_query + hit_idx + 1
                    on_progress(done / total, len(collected))
        except (wikipedia.exceptions.HTTPTimeoutError, IOError) as exc:
            # Network failure: report and return whatever was downloaded.
            self.on_error(str(exc))
            break
        if callable(should_break) and should_break():
            break
    return Corpus.from_documents(collected, 'Wikipedia', self.attributes,
                                 self.class_vars, self.metas,
                                 title_indices=[-1])
def create_corpus(self):
    """Wrap the collected tweets into a Twitter Corpus."""
    return Corpus.from_documents(
        self.tweets, 'Twitter',
        self.attributes, self.class_vars, self.metas,
        title_indices=[-1])
def create_corpus(self):
    """Create a corpus with the tweets collected so far.

    Returns:
        Corpus: A corpus built from ``self.tweets``.
    """
    # Hold the lock while reading self.tweets so a concurrent collector
    # cannot mutate the list mid-construction. The context manager
    # guarantees the lock is released even if Corpus.from_documents
    # raises -- the original acquire()/release() pair leaked the lock
    # on exception.
    with self.statuses_lock:
        return Corpus.from_documents(
            self.tweets, 'Twitter', self.attributes,
            self.class_vars, self.metas, title_indices=[-2])
def create_corpus(self, search_author):
    """Build a Twitter Corpus from the collected tweets.

    Args:
        search_author: If true, author variables become class variables;
            otherwise they are appended to the metas.
    """
    if search_author:
        class_vars, metas = self.authors, self.metas
    else:
        class_vars, metas = [], self.metas + self.authors
    return Corpus.from_documents(self.tweets, 'Twitter', self.attributes,
                                 class_vars, metas, title_indices=[-1])
def search(self, apikey, query, articles_per_query=100):
    """Search Event Registry for articles matching *query*.

    Args:
        apikey (str): Event Registry API key.
        query: Keyword(s) passed to QueryArticles.
        articles_per_query (int): Maximum number of articles to request.

    Returns:
        Corpus: A corpus built from the simplified article records.
    """
    er = EventRegistry(apiKey=apikey)
    q = QueryArticles(keywords=query)
    q.setRequestedResult(
        RequestArticlesInfo(
            page=1,
            count=articles_per_query,
            returnInfo=ReturnInfo(
                articleInfo=ArticleInfoFlags(
                    duplicateList=True, concepts=True, categories=True,
                    location=True, socialScore=True, sentiment=True,
                    dates=True))))
    res = er.execQuery(q)
    results = res['articles']['results']

    simplized_results = []
    for result in results:
        # BUG FIX: the original assigned result['date'] to every field
        # below (copy-paste), and computed the author without using it.
        authors = result['authors']
        simplized_results.append({
            'title': result['title'],
            'date': result['date'],
            'time': result.get('time', ''),
            'dataType': result.get('dataType', ''),
            'author': authors[0]['name'] if authors else 'None',
            'url': result.get('url', ''),
            # NOTE(review): crawl-time key names not visible here --
            # confirm against the Event Registry API response schema.
            'dateCrawl': result.get('dateCrawl', ''),
            'timeCrawl': result.get('timeCrawl', ''),
            'sentiment': result.get('sentiment', ''),
            'body': result.get('body', ''),
        })
        # BUG FIX: the original did `simplized_results += simplized_result`,
        # which extends the list with the dict's KEYS (strings), not the
        # dict itself; .append() stores the record.

    return Corpus.from_documents(simplized_results, 'EventRegistry',
                                 self.attributes, self.class_vars,
                                 self.metas, title_indices=[-1])
def search(self, lang, queries, articles_per_query=10, should_break=None, on_progress=None):
    """Download Wikipedia articles for each query and return them as a Corpus.

    Args:
        lang (str): A language code in ISO 639-1 format.
        queries (list of str): A list of queries.
        should_break (callable): Callback for breaking the computation
            before the end. If it evaluates to True, downloading stops and
            the documents downloaded so far are returned in a Corpus.
        on_progress (callable): Callback for progress bar.
    """
    wikipedia.set_lang(lang)
    docs = []
    n_queries = len(queries)
    for qi, current_query in enumerate(queries):
        try:
            found = wikipedia.search(current_query, results=articles_per_query)
            for ai, title in enumerate(found):
                if callable(should_break) and should_break():
                    break
                docs.extend(self._get(title, current_query, should_break))
                if callable(on_progress):
                    fraction = ((qi * articles_per_query + ai + 1)
                                / (n_queries * articles_per_query))
                    on_progress(fraction, len(docs))
        except (wikipedia.exceptions.HTTPTimeoutError, IOError) as err:
            # On a network error, stop and return what was fetched so far.
            self.on_error(str(err))
            break
        if callable(should_break) and should_break():
            break
    return Corpus.from_documents(docs, 'Wikipedia', self.attributes,
                                 self.class_vars, self.metas,
                                 title_indices=[-1])
def corpus(*texts):
    """Build a minimal example Corpus with one 'text' meta feature."""
    text_var = data.StringVariable('text')
    metas = [(text_var, lambda doc: doc['text'])]
    documents = [{'text': text} for text in texts]
    result = Corpus.from_documents(documents, 'example', [], [], metas, [-1])
    result.set_text_features([text_var])
    return result