def _spinup_detectors(self, detectors, start, spinup_time):
    # Assumes module-level: timedelta, a daterange helper (sketched below)
    # and the DOCUMENT_INDEX constant.
    print("spinup detectors")
    # Replay event-related documents in 24-hour windows so each detector
    # can build up state before live detection starts.
    for query_start, query_end in daterange(start,
                                            start + spinup_time,
                                            timedelta(hours=24),
                                            ranges=True):
        query = self.es.build_date_query(
            query_start,
            min(query_end, start + spinup_time),
            locations=True,
        )
        query['query']['bool']['must'].append(
            {'term': {'event_related': True}})
        print(f'{query_start}:',
              self.es.n_hits(index=DOCUMENT_INDEX, body=query), 'docs')
        docs = self.es.scroll_through(index=DOCUMENT_INDEX,
                                      body=query,
                                      source=False)
        for doc in docs:
            doc = self.doc_to_namedtuple(doc)
            self.maybe_send_doc_to_detector(doc, detectors, 'spinup')

    # All spin-up data has been replayed; let every detector finalize.
    for detectors_per_setting in detectors.values():
        for detector in detectors_per_setting.values():
            detector.initialize()
    return detectors
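
All of the examples on this page iterate with a daterange helper that is not shown. The following is a minimal sketch inferred from the call sites (ranges=True yields (start, end) windows that callers clamp with min(...); include_last=False suppresses a final value equal to the stop date); the real helper may differ:

from datetime import timedelta

def daterange(start, stop, step, ranges=False, include_last=True):
    """Yield datetimes from start towards stop in increments of step.

    With ranges=True, yield (window_start, window_end) tuples instead;
    the last window_end may overshoot stop, which is why the callers
    clamp it with min(window_end, stop).
    """
    current = start
    while current < stop:
        if ranges:
            yield current, current + step
        else:
            yield current
        current += step
    # Assumption: include_last=True also yields stop itself when the
    # steps land on it exactly (inferred from include_last=False below).
    if include_last and not ranges and current == stop:
        yield current
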
Example #2
def calculate_lim(self,
                  from_dt,
                  to_dt,
                  x=30,
                  inf=np.inf,
                  array=np.array,
                  percentile=np.percentile):
    """Calculate the maximum allowed gap (in seconds) between documents.

    inf, array and percentile are bound as default arguments so the
    function body uses fast local lookups instead of repeated np
    attribute lookups.
    """
    assert isinstance(from_dt, date)
    assert isinstance(to_dt, date)
    values = []
    if self.n_docs_per_day:
        values = [
            self.n_docs_per_day[day]
            for day in daterange(from_dt, to_dt, timedelta(days=1))
        ]
    if values:
        values = array(values)
        # Midpoint of the x-th and (100 - x)-th percentiles: a robust
        # estimate of the normal number of documents per day.
        self.norm_n_docs_per_day = (percentile(values, x) +
                                    percentile(values, 100 - x)) / 2
        if self.norm_n_docs_per_day == 0:
            limit = inf
        else:
            normal_gap_between_docs = 24 * 3600 / self.norm_n_docs_per_day
            limit = normal_gap_between_docs * self.fraction
    else:
        self.norm_n_docs_per_day = 0
        limit = inf
    assert limit >= 0
    return limit
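
A quick standalone illustration of the percentile-midpoint formula above, with hypothetical numbers (fraction stands in for self.fraction): the midpoint of the 30th and 70th percentiles ignores outlier days, so a single 400-document spike barely moves the estimate.

import numpy as np

docs_per_day = np.array([12, 15, 9, 400, 14, 11, 13])  # one spike day
x = 30
# Robust "normal" daily count: midpoint of 30th and 70th percentiles.
norm = (np.percentile(docs_per_day, x) +
        np.percentile(docs_per_day, 100 - x)) / 2   # -> 13.0, not ~68
fraction = 2.0  # hypothetical stand-in for self.fraction
limit = (24 * 3600 / norm) * fraction  # max gap between docs, seconds
print(norm, limit)
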
Example #3
def initial_detection(self, start, end):
    print("Initial detection")
    # Run the event detector over historical documents one day at a time.
    for query_start, query_end in daterange(start,
                                            end,
                                            timedelta(days=1),
                                            ranges=True):
        query_end = min(query_end, end)
        print("Initial detection:", query_start, "-", query_end)
        query = self.es.build_date_query(
            query_start,
            query_end,
            locations=True,
        )
        query['query']['bool']['must'].append(
            {'term': {'event_related': True}})
        documents = self.es.scroll_through(index=DOCUMENT_INDEX,
                                           body=query,
                                           source=False)
        # is_real_time is a process-shared ctypes bool, set to False
        # because this pass replays historical data.
        self.event_detector.detect_events_l(documents,
                                            is_real_time=mp.Value(
                                                c_bool, False),
                                            convert_to_named_tuple=True)
    print("Finished initial detection")
Example #4
def sample_per_day_per_adm_languages(languages, max_count=1):
    """Sample random tweets per (day, admin region) until max_count tweets
    per language are stored in the classification table."""
    start_query = datetime(2014, 7, 29)
    end_query = datetime(2018, 11, 20)
    days = list(
        daterange(start_query,
                  end_query,
                  timedelta(days=1),
                  include_last=False))

    for language in languages:
        print(language)

        # All non-retweet tweets with text in this language.
        query = {
            'query': {
                'bool': {
                    'must': [
                        {'term': {'source.lang': language}},
                        {'exists': {'field': 'text'}},
                        {'term': {'source.retweet': False}},
                    ]
                }
            }
        }
        print(es.n_hits(index=index, body=query))

        while True:
            pg.cur.execute(
                """
                SELECT COUNT(*) FROM classification WHERE language_code = %s
            """, (language, ))
            count = pg.cur.fetchone()[0]
            print(count, end='\r')
            if count >= max_count:
                break
            # Pick a random day, then collect the admin regions that have
            # tweets on that day.
            day = choice(days)
            all_adm = []
            print(day)
            for level in ('level_0', 'level_1'):
                # Count this day's tweets per admin region at this level.
                query = {
                    "size": 0,
                    "query": {
                        "bool": {
                            "must": [
                                {"term": {"source.lang": language}},
                                {"range": {"date": {
                                    "gte": day.isoformat(),
                                    "lt": (day + timedelta(days=1)).isoformat()
                                }}},
                                {"term": {"source.retweet": False}},
                            ]
                        }
                    },
                    "aggs": {
                        "adm": {
                            "terms": {
                                "field": f"locations.{level}_region",
                                "size": 500_000
                            }
                        }
                    }
                }

                res = es.search(index=index, body=query)['aggregations']['adm']
                # With a terms size this large, bucket counts are exact.
                assert res['doc_count_error_upper_bound'] == 0
                all_adm.extend([(bucket['key'], level)
                                for bucket in res['buckets']])

            print(all_adm)

            # Skip days with no geolocated tweets for this language.
            if all_adm:
                adm, level = choice(all_adm)

                tweet = get_tweet(day, day + timedelta(days=1), adm, level,
                                  language)
                print(day, all_adm)
                if tweet:
                    # tweet_date avoids shadowing datetime.date.
                    tweet_id, text, tweet_date = tweet

                    pg.cur.execute(
                        """
                        INSERT INTO classification (id, txt, date, language_code)
                        VALUES (%s, %s, %s, %s)
                        ON CONFLICT DO NOTHING
                    """, (tweet_id, text, tweet_date, language))

            pg.conn.commit()
        pg.conn.commit()
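
The get_tweet helper is not shown on this page. Below is a hypothetical sketch consistent with its call signature and its (id, text, date) return value: pick one random matching tweet for the chosen day, region, and language via an Elasticsearch function_score/random_score query. Field names mirror the queries above; the real helper may be implemented differently.

def get_tweet(start, end, adm, level, language):
    # Hypothetical: one random non-retweet for this day/region/language.
    query = {
        "size": 1,
        "query": {
            "function_score": {
                "query": {
                    "bool": {
                        "must": [
                            {"term": {"source.lang": language}},
                            {"term": {"source.retweet": False}},
                            {"term": {f"locations.{level}_region": adm}},
                            {"range": {"date": {"gte": start.isoformat(),
                                                "lt": end.isoformat()}}},
                        ]
                    }
                },
                "random_score": {},  # shuffle so size=1 is a random pick
            }
        },
    }
    hits = es.search(index=index, body=query)['hits']['hits']
    if not hits:
        return None
    hit = hits[0]
    return hit['_id'], hit['_source']['text'], hit['_source']['date']
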