Ejemplo n.º 1
0
    def search(self, lang, queries, articles_per_query=10, should_break=None, on_progress=None):
        """ Downloads Wikipedia articles matching the given queries.

        Args:
            lang (str): A language code in ISO 639-1 format.
            queries (list of str): Search phrases, processed in order.
            articles_per_query (int): Number of search results requested
                for each query.
            should_break (callable): Optional zero-argument callback. As soon
                as it returns a truthy value, downloading stops and the
                documents fetched so far are returned.
            on_progress (callable): Optional callback invoked after every
                processed article with two arguments: the completed fraction
                and the number of documents collected so far.

        Returns:
            The result of ``Corpus.from_documents`` built from the collected
            documents.
        """
        def interrupted():
            # A missing or non-callable `should_break` never interrupts.
            return callable(should_break) and should_break()

        wikipedia.set_lang(lang)

        documents = []
        for query_idx, query in enumerate(queries):
            try:
                titles = wikipedia.search(query, results=articles_per_query)
                for title_idx, title in enumerate(titles):
                    if interrupted():
                        break

                    documents.extend(self._get(title, query, should_break))

                    if not callable(on_progress):
                        continue
                    # The fraction assumes each query yields exactly
                    # `articles_per_query` results.
                    done = query_idx * articles_per_query + title_idx + 1
                    total = len(queries) * articles_per_query
                    on_progress(done / total, len(documents))
            except (wikipedia.exceptions.HTTPTimeoutError, IOError) as err:
                # Report the failure and give up on the remaining queries,
                # keeping whatever has been downloaded already.
                self.on_error(str(err))
                break

            # The inner `break` only leaves the article loop; re-check here
            # so the query loop stops as well.
            if interrupted():
                break

        return Corpus.from_documents(documents, 'Wikipedia', self.attributes,
                                     self.class_vars, self.metas, title_indices=[-1])
Ejemplo n.º 2
0
 def create_corpus(self):
     """ Builds a corpus named 'Twitter' from ``self.tweets``.

     The instance's ``attributes``, ``class_vars`` and ``metas`` are handed
     to ``Corpus.from_documents`` together with ``title_indices=[-1]``.
     """
     corpus = Corpus.from_documents(
         self.tweets, 'Twitter', self.attributes, self.class_vars,
         self.metas, title_indices=[-1])
     return corpus
Ejemplo n.º 3
0
 def create_corpus(self):
     """ Creates a corpus with collected tweets.

     ``self.statuses_lock`` is held while the corpus is built. The lock is
     taken through a ``with`` block so that it is released even when
     ``Corpus.from_documents`` raises; a bare acquire()/release() pair would
     leave the lock held forever in that case.

     Returns:
         The result of ``Corpus.from_documents`` for ``self.tweets``.
     """
     with self.statuses_lock:
         return Corpus.from_documents(self.tweets, 'Twitter', self.attributes,
                                      self.class_vars, self.metas, title_indices=[-2])
Ejemplo n.º 4
0
 def create_corpus(self, search_author):
     """ Builds a corpus named 'Twitter' from ``self.tweets``.

     Args:
         search_author: When truthy, ``self.authors`` is used as the class
             variables and ``self.metas`` as the metas. Otherwise there are
             no class variables and ``self.authors`` is appended to the
             metas.

     Returns:
         The result of ``Corpus.from_documents``.
     """
     if search_author:
         class_vars, metas = self.authors, self.metas
     else:
         class_vars, metas = [], self.metas + self.authors

     corpus = Corpus.from_documents(
         self.tweets, 'Twitter', self.attributes,
         class_vars, metas, title_indices=[-1])
     return corpus
Ejemplo n.º 5
0
    def search(self, apikey, query, articles_per_query=100):
        """ Searches Event Registry for articles matching ``query``.

        Args:
            apikey: API key passed to ``EventRegistry`` as ``apiKey``.
            query: Keywords passed to ``QueryArticles``.
            articles_per_query (int): Number of articles requested; only the
                first result page is fetched.

        Returns:
            The result of ``Corpus.from_documents`` built from one simplified
            dict per returned article.

        Raises:
            KeyError: If the response lacks ``articles``/``results`` or an
                article lacks ``title``, ``date`` or ``authors``.
        """
        er = EventRegistry(apiKey=apikey)
        q = QueryArticles(keywords=query)
        q.setRequestedResult(
            RequestArticlesInfo(
                page=1,
                count=articles_per_query,
                returnInfo=ReturnInfo(
                    articleInfo=ArticleInfoFlags(duplicateList=True,
                                                 concepts=True,
                                                 categories=True,
                                                 location=True,
                                                 socialScore=True,
                                                 sentiment=True,
                                                 dates=True))))
        response = er.execQuery(q)
        raw_articles = response['articles']['results']

        simplized_results = []
        for article in raw_articles:
            # Only the first listed author is kept; the literal string 'None'
            # marks articles without any author.
            authors = article['authors']
            author = authors[0]['name'] if authors else 'None'

            # Each field is read from its own key. The previous code copied
            # `date` into every field, which looks like a copy-paste slip.
            # NOTE(review): optional fields use .get(), so a key absent from
            # the response yields None instead of raising -- confirm that the
            # meta extractors accept None.
            simplized_results.append({
                'title': article['title'],
                'date': article['date'],
                'time': article.get('time'),
                'dataType': article.get('dataType'),
                'author': author,
                'url': article.get('url'),
                'dateCrawl': article.get('dateCrawl'),
                'timeCrawl': article.get('timeCrawl'),
                'sentiment': article.get('sentiment'),
                'body': article.get('body'),
            })

        return Corpus.from_documents(simplized_results,
                                     'EventRegistry',
                                     self.attributes,
                                     self.class_vars,
                                     self.metas,
                                     title_indices=[-1])
Ejemplo n.º 6
0
    def search(self,
               lang,
               queries,
               articles_per_query=10,
               should_break=None,
               on_progress=None):
        """ Fetches Wikipedia articles for every query and returns a corpus.

        Args:
            lang (str): A language code in ISO 639-1 format.
            queries (list of str): A list of queries.
            articles_per_query (int): How many search results to request for
                each query.
            should_break (callable): Optional callback polled before each
                article and after each query. When it returns a truthy
                value, downloading stops and the documents downloaded so far
                are returned.
            on_progress (callable): Optional progress callback, called after
                each article with the completed fraction and the current
                number of collected documents.

        Returns:
            The result of ``Corpus.from_documents`` for the collected
            documents.
        """
        wikipedia.set_lang(lang)

        # Whether the callbacks are usable never changes during the run.
        can_break = callable(should_break)
        can_report = callable(on_progress)

        collected = []
        for query_no, query in enumerate(queries):
            try:
                hits = wikipedia.search(query, results=articles_per_query)
                for hit_no, hit in enumerate(hits):
                    if can_break and should_break():
                        break

                    collected.extend(self._get(hit, query, should_break))

                    if can_report:
                        processed = query_no * articles_per_query + hit_no + 1
                        expected = len(queries) * articles_per_query
                        on_progress(processed / expected, len(collected))
            except (wikipedia.exceptions.HTTPTimeoutError, IOError) as exc:
                # Stop on a network failure, keeping what was downloaded.
                self.on_error(str(exc))
                break

            # Propagate an interruption out of the query loop as well.
            if can_break and should_break():
                break

        return Corpus.from_documents(collected,
                                     'Wikipedia',
                                     self.attributes,
                                     self.class_vars,
                                     self.metas,
                                     title_indices=[-1])
Ejemplo n.º 7
0
 def corpus(*texts):
     """ Builds an 'example' corpus holding one document per given text.

     Each text is wrapped in a ``{'text': ...}`` document. The only variable
     is a string meta called 'text', which is also set as the corpus' text
     feature.
     """
     text_var = data.StringVariable('text')
     metas = [(text_var, lambda doc: doc['text'])]
     documents = [{'text': text} for text in texts]

     result = Corpus.from_documents(documents, 'example', [], [], metas, [-1])
     result.set_text_features([text_var])
     return result
Ejemplo n.º 8
0
 def create_corpus(self):
     """ Returns a 'Twitter' corpus built from ``self.tweets``.

     Uses the instance's ``attributes``, ``class_vars`` and ``metas`` and
     passes ``title_indices=[-1]`` to ``Corpus.from_documents``.
     """
     positional = (self.tweets, 'Twitter', self.attributes,
                   self.class_vars, self.metas)
     return Corpus.from_documents(*positional, title_indices=[-1])