Example #1
    def load(self, n=1, hold=False):
        self.entities = []
        blobs = list(
            self.storage_client.list_blobs(self.bucket_name,
                                           prefix=self.PREFIX))

        if len(blobs) >= n:
            rng = range(-1, -int(n) - 1, -1)
            for i in rng:
                b = self.bucket.get_blob(blobs[i].name)
                if b.temporary_hold:
                    start = utcnow()
                    while b.temporary_hold:
                        elapsed = utcnow() - start
                        if elapsed.total_seconds() >= 10:
                            b.temporary_hold = False
                            b.patch()
                        b = self.bucket.get_blob(blobs[i].name)

                pickle_load = b.download_as_bytes()
                e = pickle.loads(pickle_load)
                e.house_keeping()

                a = Accumulator.A(e, b)
                self.entities.append(a)
        else:
            a = self.create_and_store()

            self.entities.append(a)

        last_date = self.entities[0].entity.dt
        now = utcnow()
        if last_date.day != now.day:
            a = self.create_and_store()
            self.entities.append(a)
Example #2
        def process_source(source, i):
            # build query
            start = utcnow()
            query = dict(provider={'$in': providers},
                         year_month={'$in': source.year_month},
                         _id={'$ne': source._id},
                         )

            if source.children_airlines[0] != '*':
                query['children_airlines'] = {'$in': source.children_airlines + ['*']}

            query_od = dict((k, {'$in': source[k] + ['*']})
                            for k in ('origin_city_airports', 'destination_city_airports')
                            if source[k][0] != '*')

            if query_od:
                query_od_return = dict((k, {'$in': source[r] + ['*']})
                                       for k, r in (('origin_city_airports', 'destination_city_airports'),
                                                    ('destination_city_airports', 'origin_city_airports'))
                                       if source[r][0] != '*')
                if not source.both_ways:
                    query_od_return['both_ways'] = True

                query['$or'] = [query_od, query_od_return]

            update = {'$addToSet': dict(overlap=source._id)}
            result = External_Segment_Tmp.update(query, update, multi=True)
            end = utcnow()
            log.info('Update overlaps %d (%ss) - %r', i, end-start, result)
Example #3
    def send_messages_to_table(self):
        try:
            with self.info_table.batch_writer() as batch:
                storages = [("c", self.commodity_storage),
                            ("s", self.shipyard_storage),
                            ("o", self.outfitting_storage),
                            ("b", self.blackmarket_storage),
                            ("j", self.journal_storage)]
                for prefix, storage in storages:
                    for entry in storage:
                        self.insert_into_table(batch, entry, prefix,
                                               storage[entry][0], storage[entry][1])

        except Exception as ex:
            logger.exception('Exception encountered in sending, sending to bad data for retry')
            with self.error_table.batch_writer() as batch:
                for storage in (self.commodity_storage, self.shipyard_storage,
                                self.outfitting_storage, self.blackmarket_storage,
                                self.journal_storage):
                    for entry in storage:
                        batch.put_item(Item={
                            'timestamp': utils.date_to_epoch_micro(utils.utcnow()),
                            'badData': entry,
                            'errorCause': repr(ex),
                            'source': 'DynamoRawSend'
                            })
Example #4
def insert_post_2():
    iname = check_cookie()
    if iname is False: return 'AUTHENTICATION FAILED. PLEASE LOGIN FIRST'
    title = request.forms.get('title')
    body = request.forms.get('description')
    pid = cdb.insert_post(iname, title, body, utils.utcnow())
    redirect(HOME + '_select_post/' + str(pid))
Example #5
    def execute(self, now=None):
        now = now or utcnow()
        ts = now.replace(
            minute=(now.minute // 10) * 10, second=0, microsecond=0
        )

        self.log_node.check_data(now)
        all_events = self.log_node.parsed_data
        if not all_events:
            return
        interesting_events = self._get_interesting_events(all_events)
        formatted_events = set(self._get_formatted_events(interesting_events))
        with NamedTemporaryFile() as f:
            with GzipFile(fileobj=f) as gz_f:
                writer = csv.writer(gz_f)
                writer.writerow(OUTPUT_FIELDNAMES)
                writer.writerows(formatted_events)
            f.flush()

            remote_path = self.api.send_file(
                self.data_type,
                f.name,
                ts,
                suffix='{:04}'.format(now.minute * 60 + now.second)
            )
            if remote_path is not None:
                data = {'path': remote_path, 'log_type': self.data_type}
                self.api.send_signal('logs', data=data)

        self.log_node.parsed_data = []
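
The ts value above floors the call time to the start of its 10-minute segment, while the suffix (now.minute * 60 + now.second, the seconds elapsed in the hour) keeps files within a segment distinct. A quick standalone check of the bucketing arithmetic:

from datetime import datetime

now = datetime(2024, 5, 1, 12, 37, 42, 123456)  # illustrative call time
ts = now.replace(minute=(now.minute // 10) * 10, second=0, microsecond=0)
assert ts == datetime(2024, 5, 1, 12, 30)  # floored to the 10-minute boundary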
Example #7
def add_word():
    form = AddWordForm()
    if form.validate_on_submit():
        entity = EntityStream.query.filter_by(caption=form.name.data).first()
        if entity is None:
            entity = EntityStream()
            entity.eid = form.name.data + '-' + id_generator()
            entity.creator = current_user.username
            entity.creation_time = utcnow()
            entity.caption = form.name.data
            entity.alias = form.alias.data
            entity.description = form.description.data
            for s in prefix_suggestion(entity.caption):
                searchkey = SearchKey.query.filter_by(word=s).first()
                if searchkey is None:
                    searchkey = SearchKey(s)
                else:
                    print "found searchkey", searchkey.word, searchkey.id
                entity.suggestion.append(searchkey)
        db.session.add(entity)
        #         db.session.commit()
        flash('The new word has been created.')
        #         else:
        #             LOG("add_word(): entity found in db")
        return redirect(url_for('.go_entity', name=entity.eid))

    return render_template('add_word.html', form=form)
Example #8
def cert_info(hostname, verbose=False):
    # http://stackoverflow.com/questions/30862099/how-can-i-get-certificate-issuer-information-in-python
    ret = {'hostname': hostname,
           'results': {}}
    if not hostname:
        ret['results'] = "HOVERBOARDS DON'T WORK ON WATER."
        return ret
    timeout = 1 # seconds
    try:
        LOG.info("fetching certificate info for %r", hostname)
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        s = ctx.wrap_socket(socket.socket(), server_hostname=hostname)
        s.settimeout(timeout)
        s.connect((hostname, 443))
        cert = s.getpeercert()
        LOG.debug("got: %r", cert)
        if not cert:
            ret['results'] = 'no results'
            return ret

        now = utils.utcnow()
        subject = dict(x[0] for x in cert['subject'])
        issuer = dict(x[0] for x in cert['issuer'])
        starts = parse(cert['notBefore'])
        ends = parse(cert['notAfter'])
        struct = {
            'issued_to': subject['commonName'],
            'issued_by': issuer['commonName'],

            'starts': starts,
            'starts_offset': (now - starts).days,        
            'ends': ends,
            'ends_offset': (ends - now).days,
        }
        if verbose:
            struct['raw'] = cert

        ret['results'] = struct
        return ret

    except socket.timeout:
        LOG.error("failed to fetch certificate, connection timed out after %s seconds", timeout)
        ret['results'] = 'timed out'

    except socket.error:
        LOG.error("failed to fetch certificate, connection was refused. possibly no SSL configured")
        ret['results'] = 'refused'
        
    except ssl.SSLError as err:
        LOG.error("failed to fetch certificate for %r", hostname)
        ret['results'] = err.reason
    
    except:
        LOG.exception("unhandled exception attempting to fetch certificate for hostname %r", hostname)
        raise

    return ret
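
A minimal sketch of calling cert_info, assuming the module-level LOG, utils, and parse helpers are wired up; the hostname is illustrative:

if __name__ == '__main__':
    import pprint
    info = cert_info('example.com')
    pprint.pprint(info['results'])  # a dict on success, a short string on failure
    if isinstance(info['results'], dict):
        print('days until expiry:', info['results']['ends_offset'])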
Example #9
 def expired(self):
     """
     If the last update of a position was more than a set amount of time ago,
     the position is considered no longer valid.
     """
     # NOTE: expiration is currently short-circuited; the check below never runs.
     return False
     expiration_date = self.last_updated + timedelta(minutes=TIMEDELTA)
     return utcnow() > expiration_date
Example #10
 def get_object(self, request, article_status, since):
     return {
         # used to conveniently generate the reverse url
         'original': {'article_status': article_status,
                      'since': since},
         'article_status': tuple(article_status.split('+')),
         'since': utils.utcnow() - timedelta(days=int(since)),
     }
Example #11
    def load_channels(self):
        """
        Loads the channels and tools given the plugin path specified

        :return: The loaded channels, including a tool channel, for the tools found.
        """
        channels = []

        # Try to get channels
        for channel_name in self.channel_names:
            channel_path = os.path.join(self.path, "channels")
            sys.path.append(self.path)
            mod = imp.load_module(
                channel_name, *imp.find_module(channel_name, [channel_path]))
            cls = getattr(mod, channel_name.title().replace("_", ""))
            channel_id = channel_name.split("_")[0]
            # TODO: what about up_to_timestamp?
            try:
                channels.append(cls(channel_id, up_to_timestamp=None))
            except TypeError:
                channels.append(cls(channel_id))

        # Try to get tools
        if self.has_tools:
            tool_path = os.path.join(self.path, "tools")
            # Create a tool channel using this path
            channel_id = self.channel_id_prefix + "_" + "tools"
            channel = ToolChannel(channel_id,
                                  tool_path,
                                  up_to_timestamp=utcnow())
            channels.append(channel)

        if self.has_assets:
            asset_path = os.path.join(os.path.abspath(self.path), "assets")
            channel_id = self.channel_id_prefix + "_" + "assets"
            channel = AssetsChannel2(channel_id,
                                     asset_path,
                                     up_to_timestamp=utcnow())
            channels.append(channel)
            #
            # from . import TimeInterval
            # channel.streams.values()[0].window(TimeInterval.up_to_now()).items()

        return channels
Example #12
    def run(self):
        while not self.stop_event.is_set():
            now = utcnow()
            try:
                self.execute(now=now)
            except requests_exceptions.RequestException as e:
                # catch any exception from the requests library
                logging.exception('persistent communication problem: %s', e)

            # Before we sleep, check if the stop_event is set
            if self.stop_event.is_set():
                break

            # now sleep for the service's interval
            time_taken = (utcnow() - now).total_seconds()
            delay_sec = max(0, self.poll_seconds - time_taken)
            sleep(delay_sec)

        logging.info('Service stopped')
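
The sleep computation keeps the loop on a fixed cadence: if execute() consumed 2 seconds of a 60-second interval, the service sleeps only 58. A self-contained sketch of the same pattern (poll_seconds and the utcnow stand-in are illustrative):

import time
from datetime import datetime, timezone

def utcnow():
    # stand-in for the utcnow() helper used throughout these examples
    return datetime.now(timezone.utc)

poll_seconds = 60
start = utcnow()
# ... one unit of work would run here ...
time_taken = (utcnow() - start).total_seconds()
time.sleep(max(0, poll_seconds - time_taken))  # sleep only the remainder of the interval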
Example #13
 def check_data(self, now=None):
     # no handler -> nothing we can do
     if not HAS_PSUTIL:
         return
     # we only want to gather data once a minute
     now = now or utcnow()
     if (now - self.last_send) >= SEND_DELTA:
         logging.info('gathering stats')
         # get stats, store in data
         data = [dumps(self._gather(), sort_keys=True)]
         self.flush_data(data, now)
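
SEND_DELTA is defined outside this snippet; judging from the "once a minute" comment it is presumably a one-minute timedelta, e.g.:

from datetime import timedelta

SEND_DELTA = timedelta(minutes=1)  # assumed value, matching the comment above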
Example #15
    def execute(self, now=None):
        # We use the call time to determine query parameters and for the
        # remote storage location.
        now = now or utcnow()
        now = now.replace(tzinfo=utc)
        self.last_poll = self.last_poll or now
        ts = now.replace(minute=(now.minute // 10) * 10,
                         second=0,
                         microsecond=0)

        # Check the configuration and display helpful error messages
        if not self._validate_configuration():
            logging.error('Invalid configuration, could not start')
            return

        # Activate the pxGrid session
        if not self._activate():
            logging.warning('Activate request failed')
            return

        # Get the session service information
        peer_node_name, base_url = self._lookup_service()
        secret = self._get_secret(peer_node_name)

        # Do the query (starting one tick after the last poll) and save the
        # most recent timestamp for next time.
        start_dt = self.last_poll + TICK_DELTA
        sessions = self._query_sessions(base_url, start_dt, secret)
        if not sessions:
            logging.info('No sessions since %s', self.last_poll)
            return

        # Normalize the data and send it out
        normalized_sessions = self._normalize_sessions(sessions)
        with NamedTemporaryFile() as f:
            with GzipFile(fileobj=f) as gz_f:
                writer = DictWriter(gz_f, fieldnames=OUTPUT_FIELDNAMES)
                writer.writeheader()
                writer.writerows(normalized_sessions)
            f.flush()

            remote_path = self.api.send_file(
                DATA_TYPE,
                f.name,
                ts,
                suffix='{:04}'.format(now.minute * 60 + now.second))
            if remote_path is not None:
                data = {'path': remote_path, 'log_type': DATA_TYPE}
                self.api.send_signal('logs', data=data)

        # Save the last poll time
        self.last_poll = max(dt_parse(s['timestamp']) for s in sessions)
Example #16
    def create_and_store(self):
        entity = Accumulator_Entity()
        entity.dt = ceil_dt(utcnow(), 15)

        filename = self.get_filename(entity.dt)

        blob = self.bucket.blob(filename)
        pickle_dump = pickle.dumps(entity)
        blob.upload_from_string(data=pickle_dump)

        a = Accumulator.A(entity, blob)

        return a
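
ceil_dt is not shown in this listing; from the call site it rounds a datetime up to the next N-minute boundary. A plausible sketch (the upstream implementation may differ):

from datetime import datetime, timedelta

def ceil_dt(dt, minutes):
    # round dt up to the next multiple of 'minutes', zeroing seconds/microseconds
    base = dt.replace(second=0, microsecond=0)
    remainder = base.minute % minutes
    if remainder or dt.second or dt.microsecond:
        base += timedelta(minutes=minutes - remainder)
    return base

assert ceil_dt(datetime(2024, 5, 1, 12, 7, 1), 15) == datetime(2024, 5, 1, 12, 15)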
Example #17
    def execute(self, now=None):
        logging.info('Checking for Suricata alerts')
        self._rotate_logs()
        self._upload(now, compress=True)

        # ideally we'll update this to use the e-tag, for more responsive
        # config updates. But for now, do it at the start of every day.
        # we call utcnow() again to avoid the race condition where we miss
        # midnight.
        next_time = utcnow() + timedelta(seconds=UPDATE_INTERVAL_SECONDS)
        need_ruleset_update = (now and next_time.date() > now.date())
        if need_ruleset_update:
            self._update_rules()
Example #19
    def execute(self, now=None):
        logging.info('Checking for Suricata alerts')
        self._rotate_logs()
        self._upload(now, compress=True)

        # ideally we'll update this to use the e-tag, for more responsive
        # config updates. But for now, do it at the start of every day.
        # we call utcnow() again to avoid the race condition where we miss
        # midnight.
        next_time = utcnow() + timedelta(seconds=UPDATE_INTERVAL_SECONDS)
        should_update = (now and next_time.date() > now.date())
        if (not os.path.exists(SURICATA_RULE_PATH)) or should_update:
            logging.info('Updating Suricata rules')
            self._update_rules()
            logging.info('Finished updating Suricata rules')
Example #20
    def execute(self, now=None):
        for data_type, priority in MESSAGE_MAP.iteritems():
            try:
                params = self.state[data_type]
            except KeyError:
                params = {'time__gt': utcnow().replace(tzinfo=utc).isoformat()}
                self.state[data_type] = params

            messages = self.get_data(data_type, params)
            if not messages:
                continue

            max_time = max(msg['time'] for msg in messages)
            self.state[data_type] = {'time__gt': max_time}
            self.publish(messages, priority)
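
The state dict here is an incremental-poll cursor: the first pass asks for everything newer than "now", and each later pass asks for everything newer than the largest timestamp already seen. The same pattern in outline (poll and fetch are illustrative names; utcnow and utc as in the example):

state = {}

def poll(data_type, fetch):
    # first poll for a data_type: only records newer than "now"
    params = state.setdefault(
        data_type, {'time__gt': utcnow().replace(tzinfo=utc).isoformat()})
    messages = fetch(data_type, params)
    if messages:
        # advance the cursor past the newest record seen so far
        state[data_type] = {'time__gt': max(msg['time'] for msg in messages)}
    return messages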
Example #22
 def now_minus(cls,
               weeks=0,
               days=0,
               hours=0,
               minutes=0,
               seconds=0,
               milliseconds=0):
     delta = timedelta(weeks=weeks,
                       days=days,
                       hours=hours,
                       minutes=minutes,
                       seconds=seconds,
                       milliseconds=milliseconds,
                       microseconds=0)
     now = utcnow()
     return TimeInterval(now - delta, now)
Example #23
def parse_time_tuple(start, end):
    """
    Parse a time tuple. These can be:
      relative in seconds,       e.g. (-4, 0)
      relative in timedelta,     e.g. (timedelta(seconds=-4), timedelta(0))
      absolute in date/datetime, e.g. (datetime(2016, 4, 28, 20, 0, 0, 0, UTC), datetime(2016, 4, 28, 21, 0, 0, 0, UTC))
      absolute in iso strings,   e.g. ("2016-04-28T20:00:00.000Z", "2016-04-28T20:01:00.000Z")
      Mixtures of relative and absolute are not allowed

    :param start: Start time
    :param end: End time
    :type start: int | timedelta | datetime | str
    :type end: int | timedelta | datetime | str
    :return: TimeInterval or RelativeTimeInterval object
    """
    if isinstance(start, int):
        start_time = timedelta(seconds=start)
    elif isinstance(start, timedelta):
        start_time = start
    elif start is None:
        start_time = MIN_DATE
    elif isinstance(start, (date, datetime)):
        start_time = start.replace(tzinfo=UTC)
    else:
        start_time = ciso8601.parse_datetime(start).replace(tzinfo=UTC)

    if isinstance(end, int):
        # TODO: add check for future (negative values) and ensure that start < end
        if not isinstance(start_time, timedelta):
            raise ValueError("Can't mix relative and absolute times")
        end_time = timedelta(seconds=end)
    elif isinstance(end, timedelta):
        if not isinstance(start_time, timedelta):
            raise ValueError("Can't mix relative and absolute times")
        end_time = end
    elif end is None:
        end_time = utcnow()  # TODO: or MAX_DATE?
    elif isinstance(end, datetime):
        end_time = end.replace(tzinfo=UTC)
    else:
        end_time = ciso8601.parse_datetime(end).replace(tzinfo=UTC)

    if isinstance(start_time, timedelta):
        return RelativeTimeInterval(start=start_time, end=end_time)
    else:
        return TimeInterval(start=start_time, end=end_time)
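
A few calls showing the accepted input forms (return types as described in the docstring):

parse_time_tuple(-4, 0)                                # relative seconds -> RelativeTimeInterval
parse_time_tuple(timedelta(seconds=-4), timedelta(0))  # relative timedeltas
parse_time_tuple("2016-04-28T20:00:00.000Z",
                 "2016-04-28T20:01:00.000Z")           # absolute ISO strings -> TimeInterval
parse_time_tuple(None, None)                           # open interval: MIN_DATE up to utcnow()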
Example #24
def store_metric_environment():
    envelope = request.get_json()
    if not envelope:
        msg = 'no Pub/Sub message received'
        print(f'error: {msg}')
        return f'Bad Request: {msg}', 400

    if not isinstance(envelope, dict) or 'message' not in envelope:
        msg = 'invalid Pub/Sub message format'
        print(f'error: {msg}')
        return f'Bad Request: {msg}', 400

    pubsub_message = envelope['message']

    payload = ''
    if isinstance(pubsub_message, dict) and 'data' in pubsub_message:
        payload = base64.b64decode(
            pubsub_message['data']).decode('utf-8').strip()

    if "location:house.basement" in payload:
        print(re.match(r"temperature:([0-9]+\.[0-9]+)", payload))
        json_content = {
            "temperature":
            float(
                re.match(".+temperature:([0-9]+\.[0-9]+)",
                         payload).groups()[0]),
            "original_payload":
            payload
        }
        filename = "environment_sensor_basement-" + datetime.now().strftime(
            FORMAT_DATE_DASH)
        create_file(json.dumps(json_content), filename)

        accumulator = Accumulator(app.logger)
        n = utcnow()
        try:
            accumulator.add_temperature(
                n, temp_basement=json_content.get('temperature'))
        except ValueError as ex:
            app.logger.warning(
                "Accumulator - no value to add - content: {} --- {}".format(
                    payload, ex))

    return ('', 204)
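
The second regex above anchors at the start of the payload and captures the decimal temperature; a quick check against an assumed payload shape:

import re

payload = "location:house.basement temperature:21.5"  # illustrative
m = re.match(r".+temperature:([0-9]+\.[0-9]+)", payload)
assert m and float(m.groups()[0]) == 21.5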
Example #25
def acc(j):
    accumulator = Accumulator(app.logger)
    n = utcnow()

    if j.get('temperature') is not None:
        j['temperature'] = float(j.get('temperature'))
    if j.get('humidity') is not None:
        j['humidity'] = float(j.get('humidity'))
    if j.get('stove_exhaust_temp') is not None:
        j['stove_exhaust_temp'] = float(j.get('stove_exhaust_temp'))

    try:
        accumulator.add_temperature2(n, value_dict=j)
    except ValueError as ex:
        app.logger.warning(
            "Accumulator - no value to add - content: {} --- {}".format(
                j, ex))

    return accumulator
Example #26
    def execute(self, now=None):
        # Retrieve entries from the log file
        now = now or utcnow()
        self.log_node.check_data(now)

        # We will send data from the previous 10 minute segment
        now_segment = now.replace(minute=(now.minute // 10) * 10,
                                  second=0,
                                  microsecond=0)
        send_segment = now_segment - timedelta(minutes=10)

        # Remove data that came in too late to do anything about
        all_segments = sorted(self.log_node.parsed_data.iterkeys())
        for segment in all_segments:
            if segment < send_segment:
                del self.log_node.parsed_data[segment]

        self._check_point_to_csv(send_segment, now)
        super(CheckPointPusher, self).execute(now=now)
Example #27
    def execute(self, now=None):
        if not self.logger.handlers:
            return

        for data_type in self.notification_types:
            if data_type not in MESSAGE_MAP:
                continue

            endpoint = MESSAGE_MAP[data_type]['endpoint']
            priority = MESSAGE_MAP[data_type]['priority']

            try:
                params = self.state[data_type]
            except KeyError:
                params = {'time__gt': utcnow().replace(tzinfo=utc).isoformat()}
                self.state[data_type] = params

            messages = self.get_data(endpoint, params)
            if not messages:
                continue

            max_time = max(msg['time'] for msg in messages)
            self.state[data_type] = {'time__gt': max_time}
            self.publish(messages, priority)
Example #28
def get_data(xlsx_files, year_months):
    """
    Populate the database with data extract in xlsx files. One file per year_month, only one tab per file.
    Back/Forth routes in rows, one column per way.
    :param xlsx_files: dict of file names
    :param year_months: list of strings (YYYY-MM)
    :return:
    """
    global provider, unknown_airports
    now = utcnow()
    airport_replacement = {}
    airport_exclusions = {}

    def log_bulk(self):
        log.info('  store external_segment: %r', self.nresult)

    for xlsx_f in xlsx_files:  # loop through each file
        previous_data = pd.DataFrame(columns=['origin', 'destination', 'year_month', 'passengers'])
        row_nb = 0
        if "domestic" in xlsx_f:
            perimeter = "domestic"
            full_provider = provider + ' - domestic'
        else:
            perimeter = "international"
            full_provider = provider + ' - intl'
        print('******************** processing Excel file:', xlsx_f)
        xls = format_file(xlsx_f, perimeter)
        all_rows = len(xls.index)
        row_nb = 0

        with External_Segment_Tmp.unordered_bulk(1000, execute_callback=log_bulk) as bulk:
            for row_index, row in xls.iterrows():  # loop through each row (origin, destination) in file
                row_nb += 1
                year_month = row['year_month']
                if year_month not in year_months:
                    continue
                # First the process for domestic files
                if perimeter == "domestic":
                    passengers = int(row['Passengers'])
                    airport_origin = row['From']
                    airport_destination = row['To']
                    if airport_origin in airport_exclusions or airport_destination in airport_exclusions:  # skip exclusions
                        continue
                    if airport_origin in airport_replacement:  # correct the wrong codes
                        airport_origin = airport_replacement.get(airport_origin)
                    if airport_destination in airport_replacement:  # correct the wrong codes
                        airport_destination = airport_replacement.get(airport_destination)
                    if not check_airport(airport_origin, passengers, perimeter):
                        continue
                    if not check_airport(airport_destination, passengers, perimeter):
                        continue

                    if ((previous_data['origin'] == airport_origin) &
                            (previous_data['destination'] == airport_destination)
                            & (previous_data['year_month'] == year_month)).any():
                        new_row = False
                        # Add to Excel file's total_pax the "passenger" integer you get from filtering
                        # previous_data on other columns
                        passengers += int(previous_data['passengers'][
                                              (previous_data['origin'] == airport_origin) &
                                              (previous_data['destination'] == airport_destination) &
                                              (previous_data['year_month'] == year_month)])
                    else:
                        new_row = True
                    dic = dict(provider=full_provider,
                               data_type='airport',
                               airline=['*'],
                               airline_ref_code=['*'],
                               total_pax=passengers,
                               overlap=[],
                               origin=[airport_origin],
                               destination=[airport_destination],
                               year_month=[year_month],
                               raw_rec=dict(row), both_ways=False,
                               from_line=row_index, from_filename=xlsx_f, url=domestic_url)

                    new_data = pd.Series({'origin': airport_origin, 'destination': airport_destination,
                                          'year_month': year_month, 'passengers': passengers}).to_frame()
                    if new_row:
                        previous_data = previous_data.append(new_data.T, ignore_index=True)
                    else:
                        previous_data['passengers'][
                            (previous_data['origin'] == airport_origin) &
                            (previous_data['destination'] == airport_destination) &
                            (previous_data['year_month'] == year_month)] = passengers  # Modify previous_data's pax

                    query = dict((k, dic[k]) for k in ('origin', 'destination', 'year_month', 'provider',
                                                       'data_type', 'airline'))
                    bulk.find(query).upsert().update_one({'$set': dic, '$setOnInsert': dict(inserted=now)})
                    if row_nb % 1000 == 0:
                        print('{0:.3g}'.format(float(row_nb) / float(all_rows) * 100) + '%')

                # Now for international files
                else:
                    # Handle missing data, written ".." in the excel files
                    row.replace('..', np.nan, inplace=True)
                    if pd.isnull(row['TotalPax']):
                        continue
                    if pd.isnull(row['PaxIn']):
                        way_in = False
                    else:
                        way_in = True
                        passengers_in = int(row['PaxIn'])
                    if pd.isnull(row['PaxOut']):
                        way_out = False
                    else:
                        way_out = True
                        passengers_out = int(row['PaxOut'])
                    australian_city = row['AustralianPort']
                    other_city = row['ForeignPort']
                    other_country = row['Country']
                    australian_airport = find_airports_by_name(australian_city, 'australian')
                    other_airport = find_airports_by_name(other_city, 'other')
                    # If one of the airports is not recognized by name, store and skip
                    if not australian_airport:
                        check_airport(airport=None, pax=int(row['TotalPax']), perimeter='international',
                                      city=australian_city, country='Australia')
                        continue
                    if not other_airport:
                        check_airport(airport=None, pax=int(row['TotalPax']), perimeter='international',
                                      city=other_city, country=other_country)
                        continue

                    # Only store data if there was an integer in the PaxIn and/or PaxOut
                    if way_in:
                        dic_in = dict(provider=full_provider,
                               data_type='airport',
                               airline=['*'],
                               airline_ref_code=['*'],
                               total_pax=passengers_in,
                               origin=sorted(other_airport),
                               destination=sorted(australian_airport),
                               year_month=[row['year_month']],
                               raw_rec=dict(row), both_ways=False,
                               from_line=row_index, from_filename=xlsx_f, url=domestic_url)
                        query = dict((k, dic_in[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type'))
                        bulk.find(query).upsert().update_one({'$set': dic_in, '$setOnInsert': dict(inserted=now)})

                    if way_out:
                        dic_out = dict(provider=full_provider,
                                  data_type='airport',
                                  airline=['*'],
                                  airline_ref_code=['*'],
                                  total_pax=passengers_out,
                                  origin=sorted(australian_airport),
                                  destination=sorted(other_airport),
                                  year_month=[row['year_month']],
                                  raw_rec=dict(row), both_ways=False,
                                  from_line=row_index, from_filename=xlsx_f, url=domestic_url)
                        query = dict((k, dic_out[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type'))
                        bulk.find(query).upsert().update_one({'$set': dic_out, '$setOnInsert': dict(inserted=now)})
        log.info('stored: %r', bulk.nresult)
Example #29
def main(runPath, argv):
    try:
        opts, args = getopt.getopt(argv, 'ip:', ['iam_role', 'profile_name='])
    except getopt.GetoptError as err:
        print(repr(err))
        print('EDDNDynamoRaw.py -p <profile name> OR -i')
        sys.exit(2)

    profile_name = 'eddntest'
    for opt, arg in opts:
        if opt in ("-i", "--iam_role"):
            profile_name = ''
        if opt in ("-p", "--profile_name"):
            profile_name = arg

    logging.config.dictConfig({
        'version': 1,
        'disable_existing_loggers': False,

        'formatters': {
            'standard': {
                'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
            },
        },

        'handlers': {
            'default': {
                'class': 'logging.StreamHandler',
                'level': 'INFO',
                'formatter': 'standard',
                'stream': 'ext://sys.stdout'
            },
            'file_handler': {
                'class': 'logging.handlers.RotatingFileHandler',
                'formatter': 'standard',
                'level': 'INFO',
                'filename': 'eddn-dynamo-raw.log',
                'maxBytes': 10485760,
                'backupCount': 5,
                'encoding': 'utf8'
            }
        },

        'loggers': {
            '': {
                'handlers': ['default', 'file_handler'],
                'level': 'INFO',
                'propagate': True
            }
        }
    })
    logger.debug('Logging configured')

    commodity_schema1 = requests.get('http://schemas.elite-markets.net/eddn/commodity/1', headers={'Connection': 'close'}).json()
    logger.info('Obtained commodity schema v1')
    commodity_schema2 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/924c948a0233421e684145a6c40751c5a7a6bef9/schemas/commodity-v2.0.json', headers={'Connection': 'close'}).json()
    logger.info('Obtained commodity schema v2')
    commodity_schema3 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/master/schemas/commodity-v3.0.json', headers={'Connection': 'close'}).json()
    logger.info('Obtained commodity schema v3')
    shipyard_schema1 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/924c948a0233421e684145a6c40751c5a7a6bef9/schemas/shipyard-v1.0.json', headers={'Connection': 'close'}).json()
    logger.info('Obtained shipyard schema v1')
    shipyard_schema2 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/master/schemas/shipyard-v2.0.json', headers={'Connection': 'close'}).json()
    logger.info('Obtained shipyard schema v2')
    outfitting_schema1 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/924c948a0233421e684145a6c40751c5a7a6bef9/schemas/outfitting-v1.0.json', headers={'Connection': 'close'}).json()
    logger.info('Obtained outfitting schema v1')
    outfitting_schema2 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/master/schemas/outfitting-v2.0.json', headers={'Connection': 'close'}).json()
    logger.info('Obtained outfitting schema v2')
    journal_schema1 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/master/schemas/journal-v1.0.json', headers={'Connection': 'close'}).json()
    logger.info('Obtained journal schema v1')
    blackmarket_schema1 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/master/schemas/blackmarket-v1.0.json', headers={'Connection': 'close'}).json()
    logger.info('Obtained blackmarket schema v1')

    if profile_name:
        boto3.setup_default_session(profile_name=profile_name)

    dynamodb = boto3.resource('dynamodb', region_name='eu-west-1')
    logger.info('Connected to Dynamo')

    ioloop.install()
    logger.info('Installed PyZMQ version of Tornado IOLoop')

    context = zmq.Context()
    message_processes = MessageProcessor(commodity_schema1, commodity_schema2, commodity_schema3, shipyard_schema1, shipyard_schema2, outfitting_schema1, outfitting_schema2, journal_schema1, blackmarket_schema1, dynamodb)
    # Ideally the timeout here would be coordinated with keep-alive timing from EDDN
    subscriber = Subscriber(context, random.randint(1500, 1800), message_processes.process_message, message_processes.send_messages)
    while not subscriber.shutdown_signalled:
        try:
            subscriber.start()
        except Exception as ex:
            logger.exception('Exception encountered in communications, listening again')
            bad_data_table.put_item(Item={
                'timestamp': utils.date_to_epoch_micro(utils.utcnow()),
                'errorCause': repr(ex),
                'source': 'DynamoRaw'
                })
            sleep(0.001)
Example #30
def run(site, lst, date, targets, testsources, calibrators, args):
    if date is None:
        date = datetime.date.today()
    if lst is None:
        utc = utils.utcnow()
        lst = site.utc_to_lst(utc=utc, date=date)
    
    datestr = date.strftime("%b %d, %Y")
    lststr = utils.deg_to_hmsstr(lst*15)[0]
    utc = site.lst_to_utc(lst=lst, date=date)
    utcstr = utils.deg_to_hmsstr(utc*15)[0]
    print "%s\tLST: %s\tUTC: %s\n" % (datestr, lststr, utcstr)
    for srclist in [calibrators, targets, testsources]:
        for src in srclist:
            ra_deg, dec_deg = src.get_posn(lst, date)
            rastr = "R.A. (J2000): %s" % utils.deg_to_hmsstr(ra_deg, 2)[0]
            decstr = "Dec. (J2000): %s" % utils.deg_to_dmsstr(dec_deg, 2)[0]
            print "%-20s%-27s%27s" % (src.name, rastr, decstr)
            try:
                risetime, settime = src.get_rise_set_times(site, date)
            except errors.SourceIsCircumpolar:
                srctypestr = "(%s)" % srclist.name
                print "%-20sSource is circumpolar." % srctypestr
            except errors.SourceNeverRises:
                srctypestr = "(%s)" % srclist.name
                print "%-20sSource never rises." % srctypestr
            except errors.MultipleRiseSets:
                srctypestr = "(%s)" % srclist.name
                print "%-20sMultiple rise/set times?!" % srctypestr
            except:
                srctypestr = "(%s)" % srclist.name
                print "%-20sError! Oops..." % srctypestr
                raise
            else:
                if src.is_visible(site, lst, date):
                    eventstr = "Source sets in %s" % \
                                utils.deg_to_hmsstr(((settime-lst)%24)*15)[0]
                else:
                    eventstr = "Source rises in %s" % \
                                utils.deg_to_hmsstr(((risetime-lst)%24)*15)[0]
                risetosetstr = "Rise to set time: %s" % \
                            utils.deg_to_hmsstr(((settime-risetime)%24)*15)[0]
                riselststr = "Rise (LST): %s" % \
                            utils.deg_to_hmsstr((risetime%24)*15)[0]
                riseutcstr = "Rise (UTC): %s" % \
                            utils.deg_to_hmsstr((site.lst_to_utc(risetime, \
                                        date)%24)*15)[0]
                setlststr = "Set (LST): %s" % \
                            utils.deg_to_hmsstr((settime%24)*15)[0]
                setutcstr = "Set (UTC): %s" % \
                            utils.deg_to_hmsstr((site.lst_to_utc(settime, \
                                        date)%24)*15)[0]
             
                srctypestr = "(%s)" % srclist.name
                print "%-20s%-27s%27s" % (srctypestr, risetosetstr, eventstr)
                print " "*20 + "%-22s%22s" % (riselststr, setlststr)
                print " "*20 + "%-22s%22s" % (riseutcstr, setutcstr)
            if src.notes:
                print ""
                print " "*20 + "NOTES: %s" % src.notes
                print ""
            print ""
Example #31
 def __init__(self, app):
     self.app = app
     self.expires_value = utils.utcnow().strftime("%a, %d %b %Y %H:%M:%S UTC")
Example #32
 def checkpoint(self, now=None):
     self.data = []
     self.last_send = now or utcnow()
Example #34
    def process_message(self, msg):
        year = utils.utcnow().year
        if self.info_table is None or self.year != year:
            self.info_table = self.dynamodb.Table('eddn-archive-{0}'.format(year))
            self.year = year

        try:
            raw_json = zlib.decompress(msg).decode(encoding='UTF-8')
            try:
                msg_from_json = simplejson.loads(raw_json)
                logger.debug('Raw json {0}'.format(msg_from_json))

                schema_map = {
                    'http://schemas.elite-markets.net/eddn/commodity/1':
                        (self.commodity_schema1, self.commodity_storage, 'commodity schema v1'),
                    'http://schemas.elite-markets.net/eddn/commodity/2':
                        (self.commodity_schema2, self.commodity_storage, 'commodity schema v2'),
                    'http://schemas.elite-markets.net/eddn/commodity/3':
                        (self.commodity_schema3, self.commodity_storage, 'commodity schema v3'),
                    'http://schemas.elite-markets.net/eddn/shipyard/1':
                        (self.shipyard_schema1, self.shipyard_storage, 'shipyard schema v1'),
                    'http://schemas.elite-markets.net/eddn/shipyard/2':
                        (self.shipyard_schema2, self.shipyard_storage, 'shipyard schema v2'),
                    'http://schemas.elite-markets.net/eddn/outfitting/1':
                        (self.outfitting_schema1, self.outfitting_storage, 'outfitting schema v1'),
                    'http://schemas.elite-markets.net/eddn/outfitting/2':
                        (self.outfitting_schema2, self.outfitting_storage, 'outfitting schema v2'),
                    'http://schemas.elite-markets.net/eddn/blackmarket/1':
                        (self.blackmarket_schema1, self.blackmarket_storage, 'blackmarket schema v1'),
                    'http://schemas.elite-markets.net/eddn/journal/1':
                        (self.journal_schema1, self.journal_storage, 'journal schema v1'),
                }
                schema_ref = msg_from_json['$schemaRef']
                if schema_ref not in schema_map:
                    logger.debug('Data returned is not commodity, shipyard, outfitting, blackmarket, journal, ignoring {0}'.format(msg_from_json))
                    return
                schema, storage, label = schema_map[schema_ref]
                jsonschema.validate(msg_from_json, schema)
                logger.debug('Json passed %s validation', label)

                timestamp = utils.utcnow()

                if raw_json not in storage:
                    storage[raw_json] = (timestamp, msg_from_json)

                if len(storage) >= 3:
                    self.send_messages()

            except Exception as ex:
                logger.exception('Exception encountered in parsing, listening again')
                self.error_table.put_item(Item={
                    'timestamp': utils.date_to_epoch_micro(utils.utcnow()),
                    'badData': raw_json,
                    'errorCause': repr(ex),
                    'source': 'DynamoRawStore'
                    })
                sleep(0.001)

        except Exception as ex:
            logger.exception('Exception encountered in communications, listening again')
            self.error_table.put_item(Item={
                'timestamp': utils.date_to_epoch_micro(utils.utcnow()),
                'errorCause': repr(ex),
                'source': 'DynamoRawDecompress'
                })
            sleep(0.001)
Example #35
    author_id = db.Column(db.Integer, db.ForeignKey('users.username'))
    stream_id = db.Column(db.Integer, db.ForeignKey('streams.eid'))
"""

sys.stdout.write('Post\tpid\ttitle\tbody\tbody_html\ttimestamp\tauthor_id\tstream_id\n')

entitymap = defaultdict(set)

with open(sys.argv[1]) as infile:
    for line in infile:
        fields = line.strip('\r\t\n').split('\t')
        if len(fields) < 2: continue
        caption = fields[1]
        eid = fields[0]
        entitymap[caption].add(eid)

with open(sys.argv[2]) as infile2:
    for line in infile2:
#        print line
        fields = line.strip('\r\t\n').split('\t')
        if fields[0] not in entitymap: continue
        for stream in entitymap[fields[0]]:
            pid = 'post-' + id_generator()
            title = fields[1]
            html = fields[2]
            author = 'wikomega'
            streamid = stream
            timestamp = str(utcnow())
            sys.stdout.write('%s\n' % '\t'.join([pid, title, html, author, streamid, timestamp]))
        
Example #37
def get_data(csv_files):
    """
    Populate the database with data extract in csv files
    :return:
    """
    global provider
    airport_replacement = {
        "SBCD": "SSCC",
        "SWUY": "SBUY",
        "SBJI": "SWJI",
        "RJNN": "RJNA",
        "SBPM": "SBPJ",
        "SEQU": "SEQM",
        "SNQY": "SBJU",
        "SJDB": "SBDB",
        "SWJH": "SIZX",
        "SNNG": "SJNP",
        "SDFR": "SDDN",
        "1AON": "SDOW",
        "SMPB": "SMJP",
        "2NHT": "SBTC",
        "SWIQ": "SBMC",
        "SWKK": "SSKW",
        "SAIG": "SARI",
        "SBER": "SWEI"
    }
    airport_exclusions = {"SBNT", "SUPE", "6ASO", "SAMQ"}
    airline_replacements = {"VIP": "FPG", "BLC": "TAM"}

    def log_bulk(self):
        log.info('  store external_segment: %r', self.nresult)

    for csv_f in csv_files:  # loop through each file
        print('******************** processed csv:  ', csv_f)
        with open('%s/%s' % (tmp_dir, csv_f)) as csv_file:
            dict_reader = csv.DictReader(csv_file)
            row_nb = 0
            previous_data = pd.DataFrame(columns=[
                'origin', 'destination', 'year_month', 'airline', 'passengers'
            ])

            with External_Segment_Tmp.unordered_bulk(
                    1000, execute_callback=log_bulk) as bulk:

                for row in dict_reader:  # loop through each row (origin, destination) in file
                    row_nb += 1
                    for key, value in row.items():
                        if value == ':':
                            row[key] = ''

                    if (row['PASSAGEIROS PAGOS'] == '0' and row['PASSAGEIROS GRÁTIS'] == '0') or \
                            (row['PASSAGEIROS PAGOS'] == ''):  # skip rows with no pax
                        continue

                    total_pax = int(row['PASSAGEIROS PAGOS']) + int(
                        row['PASSAGEIROS GRÁTIS'])

                    row_airline = get_airline_by_icao(row['EMPRESA (SIGLA)'],
                                                      row['EMPRESA (NOME)'])

                    if row['AEROPORTO DE ORIGEM (SIGLA)'] in airport_exclusions or \
                        row['AEROPORTO DE DESTINO (SIGLA)'] in airport_exclusions:  # skip exclusions
                        continue

                    airport_origin = get_airport_by_icao(
                        row['AEROPORTO DE ORIGEM (SIGLA)'],
                        row['AEROPORTO DE ORIGEM (NOME)'])
                    airport_destination = get_airport_by_icao(
                        row['AEROPORTO DE DESTINO (SIGLA)'],
                        row['AEROPORTO DE DESTINO (NOME)'])
                    if airport_destination is None:
                        continue
                    if airport_origin is None:
                        continue
                    if row_airline in airline_replacements:
                        row_airline = airline_replacements.get(row_airline)
                    if airport_origin in airport_replacement:
                        airport_origin = airport_replacement.get(
                            airport_origin)
                    if airport_destination in airport_replacement:
                        airport_destination = airport_replacement.get(
                            airport_destination)

                    year_month = '%04d-%02d' % (int(row['ANO']), int(
                        row['MÊS']))

                    if ((previous_data['origin'] == airport_origin) &
                        (previous_data['destination'] == airport_destination)
                            & (previous_data['year_month'] == year_month)
                            & (previous_data['airline'] == row_airline)).any():
                        new_row = False
                        total_pax += int(
                            previous_data['passengers'][
                                (previous_data['origin'] == airport_origin)
                                & (previous_data['destination'] ==
                                   airport_destination)
                                & (previous_data['year_month'] == year_month)
                                & (previous_data['airline'] == row_airline)]
                        )  # Add to Excel file's total_pax the "passenger" integer you get from filtering previous_data on other columns
                    else:
                        new_row = True

                    dic = dict(provider=provider,
                               data_type='airport',
                               airline=row_airline,
                               origin=airport_origin,
                               destination=airport_destination,
                               year_month=year_month,
                               total_pax=total_pax,
                               raw_rec=row,
                               both_ways=False,
                               from_line=row_nb,
                               from_filename=csv_f,
                               url=full_url)

                    new_data = pd.Series({
                        'origin': airport_origin,
                        'destination': airport_destination,
                        'year_month': year_month,
                        'airline': row_airline,
                        'passengers': total_pax
                    }).to_frame()
                    if new_row:
                        previous_data = previous_data.append(new_data.T,
                                                             ignore_index=True)
                    else:
                        previous_data['passengers'][
                            (previous_data['origin'] == airport_origin)
                            & (previous_data['destination'] ==
                               airport_destination) &
                            (previous_data['airline'] == row_airline) &
                            (previous_data['year_month'] == year_month
                             )] = total_pax  # Modify previous_data's pax

                    now = utcnow()
                    query = dict(
                        (k, dic[k])
                        for k in ('origin', 'destination', 'year_month',
                                  'provider', 'data_type', 'airline'))
                    bulk.find(query).upsert().update_one({
                        '$set':
                        dic,
                        '$setOnInsert':
                        dict(inserted=now)
                    })
                    if row_nb % 1000 == 0:
                        print('{0} rows processed'.format(row_nb))
            log.info('stored: %r', bulk.nresult)
Example #38
def get_data(xlsx_files):
    """
    Populate the database with data extract in xlsx files
    :return:
    """
    global provider
    airport_codes = get_airports_codes()
    airport_replacement = {
        "BUE": "EZE",
        "RIO": "GIG",
        "LMC": "LMC",
        "LMA": "MCJ",
        "VGP": "VGZ",
        "PTL": "PTX",
        "MIL": "MXP",
        "LON": "LHR",
        "SAO": "CGH",
        "BSL": "BSL",
        "TRP": "TCD",
        "RLB": "LIR",
        "NYC": "JFK",
        "GTK": "FRS",
        "AWH": "USH",
        "STO": "ARN",
        "WAS": "IAD",
        "BHZ": "PLU"
    }

    def log_bulk(self):
        log.info('  store external_segment: %r', self.nresult)

    for xlsx_f in xlsx_files:  # loop through each file
        print('******************** processing Excel file:', xlsx_f)
        xls = pd.read_excel(tmp_dir + "/" + xlsx_f)
        header = np.where(
            xls.loc[:, :] == "Pasajeros")[0] + 1  # Look for column names
        xls = pd.read_excel(tmp_dir + "/" + xlsx_f,
                            header=header)  # Re-load file with headers
        xls = format_columns(xls)
        row_nb = 0
        previous_data = pd.DataFrame(
            columns=[
                'origin', 'destination', 'year_month', 'airline', 'passengers'
            ]
        )  # Create a dataframe to save data line after line, so we can check later on

        with External_Segment_Tmp.unordered_bulk(
                1000, execute_callback=log_bulk) as bulk:

            for row in range(
                    0, len(xls)
            ):  # loop through each row (origin, destination) in file
                row_nb += 1
                full_row = xls.iloc[row]
                # skip rows with no pax (pd.isnull also handles non-numeric cells)
                if pd.isnull(full_row['Passengers']) or full_row['Passengers'] == "" \
                        or int(full_row['Passengers']) == 0:
                    continue

                total_pax = int(full_row['Passengers'])
                row_airline = get_airline_by_icao(full_row['Airline'],
                                                  full_row['Airline_Name'])
                if row_airline is None:
                    continue
                airport_origin = full_row['Origen']
                airport_destination = full_row['Destino']
                if airport_origin in airport_replacement:  # correct the wrong codes
                    airport_origin = airport_replacement.get(airport_origin)
                if airport_destination in airport_replacement:  # correct the wrong codes
                    airport_destination = airport_replacement.get(
                        airport_destination)
                if airport_destination not in airport_codes:
                    unknown_airports.add(airport_destination + ":" +
                                         str(full_row['Airport_Destination']) +
                                         ":" + str(full_row['Pais Destino']))
                    continue
                if airport_origin not in airport_codes:
                    unknown_airports.add(airport_origin + ":" +
                                         str(full_row['Airport_Origin']) +
                                         ":" + str(full_row['Pais Origen']))
                    continue
                year_month = full_row['Year_Month']

                if ((previous_data['origin'] == airport_origin) &
                    (previous_data['destination'] == airport_destination)
                        & (previous_data['year_month'] == year_month)
                        & (previous_data['airline'] == row_airline)).any():
                    new_row = False
                    # Add the passenger count already accumulated in
                    # previous_data for this route and month
                    total_pax += int(previous_data['passengers'][
                        (previous_data['origin'] == airport_origin)
                        & (previous_data['destination'] == airport_destination)
                        & (previous_data['year_month'] == year_month)
                        & (previous_data['airline'] == row_airline)].iloc[0])

                else:
                    new_row = True

                dic = dict(provider=provider,
                           data_type='airport',
                           airline=row_airline,
                           origin=airport_origin,
                           destination=airport_destination,
                           year_month=year_month,
                           total_pax=total_pax,
                           raw_rec=full_row.to_json(),
                           both_ways=False,
                           from_line=row_nb,
                           from_filename=xlsx_f,
                           url=full_url)

                new_data = pd.Series({
                    'origin': airport_origin,
                    'destination': airport_destination,
                    'year_month': year_month,
                    'airline': row_airline,
                    'passengers': total_pax
                }).to_frame()
                if new_row:
                    previous_data = pd.concat([previous_data, new_data.T],
                                              ignore_index=True)
                else:
                    # Update previous_data with the new passenger count
                    mask = ((previous_data['origin'] == airport_origin)
                            & (previous_data['destination'] == airport_destination)
                            & (previous_data['airline'] == row_airline)
                            & (previous_data['year_month'] == year_month))
                    previous_data.loc[mask, 'passengers'] = total_pax

                now = utcnow()
                query = dict((k, dic[k])
                             for k in ('origin', 'destination', 'year_month',
                                       'provider', 'data_type', 'airline'))
                bulk.find(query).upsert().update_one({
                    '$set':
                    dic,
                    '$setOnInsert':
                    dict(inserted=now)
                })
                if row_nb % 1000 == 0:
                    print(row_nb / len(xls) * 100, "%")
        log.info('stored: %r', bulk.nresult)
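The running previous_data DataFrame exists only to merge duplicate rows: when the same (origin, destination, year_month, airline) key shows up again, the new pax are added to the earlier total. A plain dict keyed on that tuple does the same job with less ceremony; this is a sketch of the idea, not the original code.

from collections import defaultdict

pax_by_route = defaultdict(int)

def accumulate(origin, destination, year_month, airline, pax):
    # Running passenger total per route and month.
    key = (origin, destination, year_month, airline)
    pax_by_route[key] += pax
    return pax_by_route[key]

# accumulate('EZE', 'GIG', '2018-03', 'AR', 120)  -> 120
# accumulate('EZE', 'GIG', '2018-03', 'AR', 30)   -> 150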
Beispiel #39
0
def get_data(xlsx_files, year_months):
    """
    Populate the database with data extracted from xlsx files. Four tabs per file,
    distinguishing national/international and scheduled/charter flights.
    Routes in rows, months in columns.
    :param xlsx_files: dict of file names
    :return:
    """
    global provider
    now = utcnow()
    months = {
        "January": "01",
        "February": "02",
        "March": "03",
        "April": "04",
        "May": "05",
        "June": "06",
        "July": "07",
        "August": "08",
        "September": "09",
        "October": "10",
        "November": "11",
        "December": "12"
    }

    def log_bulk(self):
        log.info('  store external_segment: %r', self.nresult)

    for xlsx_f in xlsx_files:  # loop through each file
        print('******************** processing Excel file:', xlsx_f)
        xl = pd.ExcelFile(tmp_dir + "/" + xlsx_f)
        # Create a data frame to save data line after line, so we can check later on and add values to each other
        previous_data = pd.DataFrame(
            columns=['origin', 'destination', 'year_month', 'passengers'])

        for tab in xl.sheet_names:  # loop in all sheets of the excel file
            print('Starting', tab, 'tab in the Excel file')
            xls = xl.parse(tab)
            year = int(''.join(
                filter(str.isdigit, xlsx_f)))  # Use the renamed file for the year
            header = np.where(xls.loc[:, :] == "PAR DE CIUDADES / CITY PAIR"
                              )[0] + 3  # Look for line with column names
            xls = xl.parse(tab, header=header)  # Re-load file with headers
            xls = format_file(xls)
            xls['tab'] = tab

            with External_Segment_Tmp.unordered_bulk(
                    1000, execute_callback=log_bulk) as bulk:

                for indx, row in xls.iterrows(
                ):  # loop through each row (origin, destination) in file
                    # Skip empty rows (no text in Origin column, or year Total = 0)
                    if isinstance(row['Origin'], float) or row['Total'] == 0:
                        continue
                    # Stop at the end of the table (indicated by "T O T A L")
                    if "".join(row['Origin'].split(" ")).upper() == "TOTAL":
                        break
                    origin = unidecode(row['Origin']).upper()
                    destination = unidecode(row['Destination']).upper()
                    airport_origin = find_airports_by_name(origin, tab)
                    airport_destination = find_airports_by_name(
                        destination, tab)
                    if airport_origin is None:
                        update_unknown_airports(origin, row['Total'])
                        continue
                    if airport_destination is None:
                        update_unknown_airports(destination, row['Total'])
                        continue

                    for colname, colvalue in row.items():  # loop through month columns
                        # Only look at month columns
                        if colname not in months.keys():
                            continue
                        # skip cells with no pax
                        if pd.isnull(colvalue) or colvalue == "" or int(
                                colvalue) == 0:
                            continue
                        year_month = str(year) + "-" + months.get(colname)
                        total_pax = int(colvalue)

                        # Only treat the requested year_months
                        if year_month not in year_months:
                            continue

                        if year_month not in previous_data[
                                'year_month'].values:
                            if External_Segment_Tmp.find_one({
                                    'year_month':
                                    year_month,
                                    'provider':
                                    provider
                            }):
                                log.warning(
                                    "This year_month (%s) already exists for provider %s",
                                    year_month, provider)

                        # For international flights, only keep the airports for which capacity exists on that year_month
                        if 'INT' in tab:
                            airport_origin, airport_destination = get_capa(
                                year_month, airport_origin,
                                airport_destination)
                            if airport_destination is None or airport_origin is None:
                                no_capa.append({
                                    'year_month': year_month,
                                    'origin': origin,
                                    'destination': destination
                                })
                                continue

                        if ((previous_data['origin'] == airport_origin) &
                            (previous_data['destination']
                             == airport_destination) &
                            (previous_data['year_month'] == year_month)).any():
                            new_row = False
                            # Add the passenger count already accumulated in
                            # previous_data for this route and month
                            total_pax += int(previous_data['passengers'][
                                (previous_data['origin'] == airport_origin)
                                & (previous_data['destination'] ==
                                   airport_destination)
                                & (previous_data['year_month'] ==
                                   year_month)].iloc[0])
                        else:
                            new_row = True

                        dic = dict(
                            provider=provider,
                            data_type='airport',
                            airline=['*'],
                            airline_ref_code=['*'],
                            origin=[', '.join(airport_origin)],
                            destination=[', '.join(airport_destination)],
                            year_month=[year_month],
                            total_pax=total_pax,
                            raw_rec=dict(row),
                            both_ways=False,
                            from_line=indx,
                            from_filename=xlsx_f,
                            url=base_url + end_url)

                        new_data = pd.Series({
                            'origin': airport_origin,
                            'destination': airport_destination,
                            'year_month': year_month,
                            'passengers': total_pax
                        }).to_frame()
                        if new_row:
                            previous_data = pd.concat(
                                [previous_data, new_data.T], ignore_index=True)
                        else:
                            # Update previous_data with the new passenger count
                            mask = ((previous_data['origin'] == airport_origin)
                                    & (previous_data['destination'] ==
                                       airport_destination)
                                    & (previous_data['year_month'] ==
                                       year_month))
                            previous_data.loc[mask, 'passengers'] = total_pax

                        query = dict(
                            (k, dic[k])
                            for k in ('origin', 'destination', 'year_month',
                                      'provider', 'data_type', 'airline'))
                        bulk.find(query).upsert().update_one({
                            '$set':
                            dic,
                            '$setOnInsert':
                            dict(inserted=now)
                        })
            log.info('stored: %r', bulk.nresult)
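Both Excel loaders locate the header row the same way: parse the sheet once with no header, scan the raw cells for a known label with np.where, then re-parse from the row that actually carries the column names. A standalone sketch of that trick; the file name and label are placeholders.

import numpy as np
import pandas as pd

raw = pd.read_excel('traffic.xlsx', header=None)
# Find the first cell equal to the label; the column names sit a fixed
# offset below it (3 rows in the loader above).
rows, _ = np.where(raw.astype(str) == 'PAR DE CIUDADES / CITY PAIR')
header_row = int(rows[0]) + 3
df = pd.read_excel('traffic.xlsx', header=header_row)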
Beispiel #40
0
    logging_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=logging_format)
    handler = BackupFileHandler('load_Chili.log', backupCount=20)
    formatter = logging.Formatter(logging_format)
    handler.setFormatter(formatter)

    main_log = logging.getLogger()  # the root handler
    log = logging.getLogger('load_Chili')
    # log.setLevel(logging.INFO)
    log.setLevel(logging.DEBUG)
    log.addHandler(handler)

    log.info('Load files from Chili - %s - %r', __version__, p)

    now = utcnow()
    year_months = p.year_months[0].split(', ')
    year = list(set([int(ym[0:4]) for ym in year_months]))

    file_pattern = p.all_files or 'Trafico-de-Par-de-ciudades-por-Operador'
    get_files(year, file_pattern)

    Model.init_db(def_w=True)

    if file_pattern is not True:
        for type_flight in (
                'Internacional',
                'Nacional',
        ):
            analyse_and_store('%s-%s' % (file_pattern, type_flight))
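BackupFileHandler looks project-specific; the closest standard-library equivalent is logging.handlers.RotatingFileHandler, sketched here with the same format string (the size limit is an arbitrary assumption).

import logging
from logging.handlers import RotatingFileHandler

fmt = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = RotatingFileHandler('load_Chili.log', maxBytes=5_000_000,
                              backupCount=20)
handler.setFormatter(fmt)

log = logging.getLogger('load_Chili')
log.setLevel(logging.DEBUG)
log.addHandler(handler)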
Beispiel #41
0
def get_data(xlsx_files):
    """
    Populate the database with data extracted from xlsx files. One file per year_month, only one tab per file.
    Back-and-forth routes in rows, one column per direction.
    :param xlsx_files: dict of file names
    :return:
    """
    global provider
    now = utcnow()
    months = {
        "January": "01",
        "February": "02",
        "March": "03",
        "April": "04",
        "May": "05",
        "June": "06",
        "July": "07",
        "August": "08",
        "September": "09",
        "October": "10",
        "November": "11",
        "December": "12",
        "Jan": "01",
        "Feb": "02",
        "Mar": "03",
        "Apr": "04",
        "Jun": "06",
        "Jul": "07",
        "Aug": "08",
        "Sep": "09",
        "Sept": "09",
        "Oct": "10",
        "Nov": "11",
        "Dec": "12"
    }
    quarters = {
        "01": "Q1",
        "02": "Q1",
        "03": "Q1",
        "04": "Q2",
        "05": "Q2",
        "06": "Q2",
        "07": "Q3",
        "08": "Q3",
        "09": "Q3",
        "10": "Q4",
        "11": "Q4",
        "12": "Q4"
    }

    def log_bulk(self):
        log.info('  store external_segment: %r', self.nresult)

    for xlsx_f in xlsx_files:  # loop through each file
        if "domestic" in xlsx_f:
            perimeter = "domestic"
        else:
            perimeter = "international"
        provider_label = provider.get(perimeter)
        print('******************** processing Excel file:', xlsx_f)
        xl = pd.ExcelFile(tmp_dir + "/" + xlsx_f)
        xls = xl.parse()
        # Year_month comes from the renamed file; international files list
        # all months of the quarter
        year = int(''.join(filter(str.isdigit, xlsx_f))[-4:])
        if perimeter == "domestic":
            month = '%02d' % int(xlsx_f.split('_')[2].split('-')[0])
            year_month = [str(year) + "-" + month]
        else:
            quarter = xlsx_f.split('_')[2].split('-')[0]
            year_month = [
                str(year) + '-' + k for k, v in quarters.items()
                if v == quarter
            ]

        # Look for line with column names
        if perimeter == "domestic":
            header = np.where(
                xls.apply(lambda x: x.astype(str).str.upper().str.replace(
                    " ", "")).loc[:, :] == "CITY1")[0] + 1
        else:
            header = np.where(
                xls.apply(lambda x: x.astype(str).str.upper().str.replace(
                    " ", "")).loc[:, :] == "CITY1")[0][0] + 1
        xls = xl.parse(header=header)  # Re-load file with headers
        xls = format_file(xls, perimeter)

        all_rows = len(xls.index)

        with External_Segment_Tmp.unordered_bulk(
                1000, execute_callback=log_bulk) as bulk:
            for row in range(
                    0, len(xls)
            ):  # loop through each row (origin, destination) in file
                full_row = xls.iloc[row]
                # Skip blank rows and repeated header rows
                if pd.isnull(
                        full_row['CITY 1']) or full_row['CITY 1'] == "CITY 1":
                    continue
                # Stop at the end of the table (indicated by "TOTAL")
                if isinstance(full_row['ID'], str) and "".join(
                        full_row['ID'].split(" ")).upper() == "TOTAL":
                    break
                # Skip rows with no pax in either direction
                if isinstance(full_row['PAX TO 2'],
                              float) and full_row['PAX FROM 2'] == 0:
                    continue
                airport1 = find_airports_by_name(
                    unidecode(full_row['CITY 1']).upper(), perimeter)
                airport2 = find_airports_by_name(
                    unidecode(full_row['CITY 2']).upper(), 'domestic')
                if airport1 is None:
                    update_unknown_airports(full_row['CITY 1'],
                                            full_row['PAX TO 2'],
                                            full_row['PAX FROM 2'])
                    continue
                if airport2 is None:
                    update_unknown_airports(full_row['CITY 2'],
                                            full_row['PAX TO 2'],
                                            full_row['PAX FROM 2'])
                    continue

                # First save data from city 1 to city 2
                dic_to = dict(provider=provider_label,
                              data_type='airport',
                              airline=['*'],
                              airline_ref_code=['*'],
                              origin=sorted(airport1),
                              destination=sorted(airport2),
                              year_month=year_month,
                              total_pax=int(full_row['PAX TO 2']),
                              overlap=[],
                              raw_rec=dict(full_row),
                              both_ways=False,
                              from_line=row,
                              from_filename=xlsx_f,
                              url=full_url)
                query = dict((k, dic_to[k])
                             for k in ('origin', 'destination', 'year_month',
                                       'provider', 'data_type', 'airline'))
                bulk.find(query).upsert().update_one({
                    '$set':
                    dic_to,
                    '$setOnInsert':
                    dict(inserted=now)
                })

                # Then save data from city 2 to city 1
                dic_from = dict(provider=provider_label,
                                data_type='airport',
                                airline=['*'],
                                airline_ref_code=['*'],
                                origin=sorted(airport2),
                                destination=sorted(airport1),
                                year_month=year_month,
                                total_pax=int(full_row['PAX FROM 2']),
                                overlap=[],
                                raw_rec=dict(full_row),
                                both_ways=False,
                                from_line=row,
                                from_filename=xlsx_f,
                                url=full_url)
                query = dict((k, dic_from[k])
                             for k in ('origin', 'destination', 'year_month',
                                       'provider', 'data_type', 'airline'))
                bulk.find(query).upsert().update_one({
                    '$set':
                    dic_from,
                    '$setOnInsert':
                    dict(inserted=now)
                })
                if row % 100 == 0:
                    print('{0:.3g}'.format(float(row) / float(all_rows) *
                                           100) + '%')
        log.info('stored: %r', bulk.nresult)
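A detail worth isolating: an international file is named for a quarter, so its records carry all three year_months of that quarter, which is what the quarters lookup above produces. A small sketch of that expansion.

QUARTERS = {'01': 'Q1', '02': 'Q1', '03': 'Q1', '04': 'Q2', '05': 'Q2',
            '06': 'Q2', '07': 'Q3', '08': 'Q3', '09': 'Q3', '10': 'Q4',
            '11': 'Q4', '12': 'Q4'}

def quarter_year_months(year, quarter):
    # All year_months belonging to the given quarter, in calendar order.
    return ['%d-%s' % (year, m) for m, q in sorted(QUARTERS.items())
            if q == quarter]

# quarter_year_months(2019, 'Q2') -> ['2019-04', '2019-05', '2019-06']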
Beispiel #42
0
def update_routes(csv_file, year_months):
    """
    Save new records in External_Segment collection
    """
    now = utcnow()

    def log_bulk(self):
        log.info('  store external_segment: %r', self.nresult)

    log.info('Updating db with contents of %s...', csv_file)
    xls = pd.read_csv(tmp_dir + '/' + csv_file, sep=',', skiprows=[0, 1, 2])
    new_columns = xls.columns.values
    new_columns[0] = 'irish_airport'
    new_columns[1] = 'way'
    new_columns[2] = 'other_airport'
    for i, col in enumerate(new_columns[3:], 3):
        new_columns[i] = col.replace('M', '-')
    xls.columns = new_columns
    xls = xls.replace(' ', np.nan)
    available_year_months = new_columns[3:].tolist()

    with External_Segment_Tmp.unordered_bulk(
            1000, execute_callback=log_bulk) as bulk:
        for row_index, row in xls.iterrows():
            if pd.notnull(row['irish_airport']):
                irish_airport = row['irish_airport']
            if pd.notnull(row['way']):
                way = row['way']
            if pd.isnull(row['other_airport']):
                continue
            else:
                other_airport = row['other_airport']
            if sum(row[available_year_months]) == 0:
                continue
            for ym in available_year_months:
                # Skip the year_months that are not requested
                if ym not in year_months:
                    continue
                pax = row[ym]

                if way == 1:
                    airport_origin = irish_airport
                    airport_destination = other_airport
                else:
                    airport_origin = other_airport
                    airport_destination = irish_airport

                if not check_airport(airport_origin, pax):
                    continue
                if not check_airport(airport_destination, pax):
                    continue

                dic = dict(provider=provider,
                           data_type='airport',
                           airline=['*'],
                           airline_ref_code=['*'],
                           total_pax=pax,
                           origin=[airport_origin],
                           destination=[airport_destination],
                           year_month=[ym],
                           overlap=[],
                           raw_rec=dict(row),
                           both_ways=False,
                           from_line=row_index,
                           from_filename=csv_file,
                           url=url)
                query = dict((k, dic[k])
                             for k in ('origin', 'destination', 'year_month',
                                       'provider', 'data_type', 'airline'))
                bulk.find(query).upsert().update_one({
                    '$set':
                    dic,
                    '$setOnInsert':
                    dict(inserted=now)
                })
                if row_index % 1000 == 0:
                    print('{0:.3g}'.format(row_index / len(xls.index) * 100) +
                          '%')
    log.info('stored: %r', bulk.nresult)
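The inner loop of update_routes is a wide-to-long reshape done by hand: carry the merged 'irish_airport'/'way' cells down the rows, then emit one record per month column. pandas can express the same reshape directly; a sketch under the column names set up above, assuming xls is the frame after renaming.

# Forward-fill the merged cells, then unpivot the month columns.
xls[['irish_airport', 'way']] = xls[['irish_airport', 'way']].ffill()
long_df = xls.melt(id_vars=['irish_airport', 'way', 'other_airport'],
                   var_name='year_month', value_name='pax')
long_df = long_df.dropna(subset=['pax'])
# Each row now holds one (route, year_month, pax) record, ready for the
# same bulk upsert as above.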