Example No. 1
def main():

    logger = data_log.get_logger("twitter_data")
    todays_date = datetime.now().strftime("%m-%d-%Y_%H_%M_%S")
    start_time = time.time()
    credentials = get_twitter_credentials(logger)
    topics = get_topics(logger)
    auth_headers, auth_data = format_api_headers(credentials, logger)
    twitter_data = get_twitter_data(auth_headers, auth_data, topics, logger)

    credentials = db_utils.get_db_credentials(logger)
    db_client = db_utils.connect_db(logger, credentials)

    if len(twitter_data) > 0:
        if db_client is not None:
            ret_status = db_utils.push_data_db(logger, db_client, "twitter",
                                               twitter_data)
        else:
            logger.info("Error: Twitter Data not uploaded to Database")

        with open("{0}out/twitter_data_{1}".format(base_dir, todays_date),
                  "wb") as fp:
            pickle.dump(twitter_data, fp)
        logger.info("Stored Twitter Data at {0}".format(
            "{0}out/twitter_data_{1}".format(base_dir, todays_date)))

        logger.info("Twitter Data Nos: {0} rows, Size: {1} bytes".format(
            len(twitter_data), sys.getsizeof(twitter_data)))
        logger.info("Twitter Execution Sec: {}".format(
            (time.time() - start_time)))
    else:
        logger.info("Twitter Data Nos: {0} rows, Size: {1} bytes".format(
            len(twitter_data), sys.getsizeof(twitter_data)))
        logger.info("Twitter Data Nothing to store.")
    return 0
Example No. 2
def selecionarUsuario(idUser):
    try:
        conexao = db_utils.connect_db()
    except Exception:
        return 'Could not connect to the database', error_codes.SERVICE_UNAVAILABLE  # 503

    result = usuario_db.seleciona_usuario(idUser, conexao)
    return jsonify(result)
Example No. 3
def print_operator_codes(connection_txt):
    """Helper function to display operator codes in the console for the user"""

    engine = db_utils.connect_db(connection_txt)
    operator_codes = db_utils.get_lookup_table(engine,
                                               'operators',
                                               index_col='name',
                                               value_col='code')
    operator_code_str = '\n\t-'.join(
        sorted(['%s: %s' % operator for operator in operator_codes.items()]))

    print('Operator code options:\n\t-%s' % operator_code_str)
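
A hedged call sketch for the helper above; the connection file path and the operator rows shown in the comment are hypothetical.

# Hypothetical connection file; prints something like:
#   Operator code options:
#       -Acme Air: ACM
#       -Bush Pilots LLC: BPL
print_operator_codes('config/connection_info.txt')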
Example No. 4
def main(connection_txt, start_date, end_date, table='flight_points', start_time='00:00', end_time='23:59', bbox=None, mask_file=None, mask_buffer_distance=None, clip_output=False, output_path=None, aircraft_info=False, sql_criteria=''):

    if output_path:
        # Keep the output extension distinct from the mask_file extension checked below
        _, output_extension = os.path.splitext(output_path)
        if output_extension not in FIONA_DRIVERS:
            supported_ext = sorted(FIONA_DRIVERS.keys())
            raise ValueError('Unsupported output file type: {extension}. File extension must be either {type_str}'
                             .format(extension=output_extension,
                                     type_str='%s, or %s' % (', '.join(supported_ext[:-1]), supported_ext[-1])
                                     )
                             )
    # If a mask file is given, get the Well-Known Text representation to feed to the query
    mask = None
    if mask_file:
        # Check if the file exists
        if not os.path.isfile(mask_file):
            raise ValueError('mask_file does not exist or is not a file: %s' % mask_file)

        # Check that the file can be read
        _, path_extension = os.path.splitext(mask_file)
        if path_extension not in FIONA_DRIVERS:
            supported_ext = sorted(FIONA_DRIVERS.keys())
            raise ValueError('Unsupported mask_file type: {extension}. File extension must be either {type_str}'
                             .format(extension=path_extension,
                                     type_str='%s, or %s' % (', '.join(supported_ext[:-1]), supported_ext[-1])
                                     )
                             )
        # Make a multi-feature geometry from the mask_file
        mask = gpd.read_file(mask_file).to_crs(epsg=4326)


    engine = db_utils.connect_db(connection_txt)
    data = query_tracks(start_date, end_date, engine=engine, table=table, start_time=start_time, end_time=end_time,
                        bbox=bbox, mask=mask, mask_buffer_distance=mask_buffer_distance, clip_output=clip_output,
                        aircraft_info=aircraft_info, sql_criteria=sql_criteria)

    if output_path:
        datetime_columns = data.columns[data.dtypes == 'datetime64[ns]']
        # convert all datetime cols to str because fiona (underlying GeoPandas) freaks out about datetimes
        for c in datetime_columns:
            data[c] = data[c].astype(str)
        data.to_file(output_path, driver=FIONA_DRIVERS[output_extension])

    return data
Example No. 5
def main():
    logger.info("Starting DB2 extractor...")

    project_id = os.getenv("GCP_PROJECT")
    secret_id = os.getenv("SECRET_ID")

    db_properties = access_secret_version(project_id, secret_id)
    db_properties = '[db]\n' + db_properties
    config = configparser.RawConfigParser()
    config.read_string(db_properties)

    source_db_host = config.get("db", "source-host")
    source_db_user = config.get("db", "source-user")
    source_db_pass = config.get("db", "source-password")
    source_db_schema = config.get("db", "source-schema")
    source_conn = db_utils.connect_db(source_db_host, source_db_user,
                                      source_db_pass)
    conn = ibm_db_dbi.Connection(source_conn)

    bquery_dataset = 'db2-data'
    bquery_dataset = bquery_dataset.replace("-", "_")

    responses = []
    for table_to_export in tables_to_export:
        table = table_to_export["table"]
        source_schema_table = "{}.{}".format(source_db_schema, table)
        columns = table_to_export["columns"]
        where_clause = table_to_export["where_clause"]
        bquery_table_schema = table_to_export["schema_bigquery"]

        sql = "SELECT {} FROM {}".format(columns, source_schema_table)
        if where_clause:
            sql = "{} WHERE {}".format(sql, where_clause)

        logger.info("Running sql: %s", sql)
        # pandas dataframe with the results
        df = pd.read_sql(sql, conn)
        logger.info("Number of rows in dataframe: %s", df.shape[0])
        responses.append(upload_to_bigquery(df, table.lower(), bquery_dataset,
                                            bquery_table_schema))

    # Return the upload responses for all exported tables
    return responses
Example No. 6
def main(connection_txt, url, ssl_cert_path=None, update=False, silent=False):

    engine = db_utils.connect_db(connection_txt)

    where_clause = '' if update else ' WHERE registration NOT IN (SELECT registration FROM aircraft_info)'

    with engine.connect() as conn:
        registration_numbers = pd.read_sql(
            f'''SELECT DISTINCT registration FROM flights{where_clause};''',
            conn).squeeze()
        for registration in registration_numbers:
            was_updated = False
            try:
                was_updated = update_aircraft_info(conn, registration, url,
                                                   ssl_cert_path, update)
            except Exception as e:
                print('Could not update info for %s because %s' %
                      (registration, e))
                continue

            if was_updated and not silent:
                print('Inserted/updated info for %s' % registration)
Example No. 7
        # print(origin_ip_1)
        get('http://ty-http-d.upupfile.com/index/white/add?neek=tyhttp802165&appkey=f33f8c574ddbc7ab2d517cdc0343f9d4&white={}'
            .format(origin_ip))
        # print('Local public IP:', origin_ip)
    except Exception as e:
        print(e)


if __name__ == '__main__':

    while True:
        try:
            commit_ip()
            pre_time = (datetime.now() -
                        timedelta(minutes=0)).strftime('%Y-%m-%d %H:%M:%S')
            connect = connect_db(DB_HOST, DB_USER, DB_PASS, DATABASE, DB_PORT)
            update_sql = 'update proxy set isActive = 0 where updateTime < "{}"'.format(
                pre_time)
            insert_update_drop_data(connect, update_sql, '')
            # url = 'http://http.tiqu.qingjuhe.cn/getip3?num=1&type=1&pack=43560&port=11&yys=100017&lb=1&pb=45&gm=4&regions='
            # url = 'http://183.129.244.16:88/open?user_name=dfbhrehtp1&timestamp=1576810228&md5=53D0A64BB6C69B9C0442827DBEE4188E&pattern=json&number=1'
            # url = 'http://http.tiqu.qingjuhe.cn/getip3?num=1&type=1&pack=43545&port=1&yys=100017&lb=1&pb=45&gm=4&regions='
            # url = 'http://http.tiqu.qingjuhe.cn/getip3?num=3&type=1&pack=44015&port=1&yys=100017&lb=6&sb=%2A%3C--%3E%2A&pb=45&gm=4&regions='
            # url = 'http://http.tiqu.qingjuhe.cn/getip3?num=3&type=1&pack=44015&port=1&lb=6&sb=%2A%3C--%3E%2A&pb=45&gm=4&regions='
            url_1 = 'http://http.tiqu.qingjuhe.cn/getip3?num=3&type=1&pack=45255&port=11&lb=6&sb=%2A%3C--%3E%2A&pb=45&gm=4&regions='
            # result = get_proxy_ip(url)
            # if result == '':
            #     pass
            # else:
            #     print(result)
            #     save_ip(result, connect)
Example No. 8
def import_data(landings_path, connection_txt, sheet_name='data'):

    df = pd.read_excel(landings_path, sheet_name)

    # The data sheet has formulas for the first 2000 rows, which pandas reads as blank data, so drop those rows
    df.drop(df.index[df.isnull().all(axis=1)], inplace=True)
    validate_landings(df.copy())

    # can't parse_dates when reading file because there's an extra '00:00:00' in the date for some stupid reason
    df['departure_datetime'] = pd.to_datetime(
        df.date.dt.strftime('%Y-%m-%d ') + df.time.apply(str))
    df.drop(['date', 'time'], axis=1, inplace=True)

    # Create a column with a unique value per flight (i.e., row)
    engine = db_utils.connect_db(connection_txt)
    operator_codes = db_utils.get_lookup_table(engine,
                                               'operators',
                                               index_col='name',
                                               value_col='code')
    df.replace({'operator_code': operator_codes}, inplace=True)
    df['flight_id'] = df.apply(
        lambda row: '{operator}_{aircraft}_{departure}'.format(
            operator=row.operator_code,
            aircraft=row.aircraft_type.replace(' ', ''),
            departure=row.departure_datetime.strftime('%Y%m%d_%H%M')),
        axis=1)

    # Separate the flight info from landing data
    flights = df.loc[:, ['flight_id'] + FLIGHT_TBL_COLS]
    fees = df.loc[:, ['flight_id'] + FEE_TBL_COLS]
    landing_data = df.drop(FLIGHT_TBL_COLS + FEE_TBL_COLS, axis=1)

    flights['time_submitted'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    flights['submission_method'] = 'email'

    # Transpose the landing data such that each landing/dropoff/pickup is its own line
    landings = landing_data.melt(id_vars=[c for c in landing_data if not c.startswith('n_')],
                                 value_name='n_passengers')\
        .dropna(subset=['n_passengers'])
    landings['landing_type'] = landings.variable.apply(
        lambda x: x.split('_')[1])
    landings['location'] = [
        landings.loc[i, c.replace('n_', 'location_')]
        for i, c in landings.variable.items()
    ]
    location_cols = [c for c in landing_data if c.startswith('location_')]
    landings.drop(location_cols + ['variable'], axis=1, inplace=True)
    landings.location = landings.location.apply(lambda x: x.split(' - ')[0])

    with engine.connect() as conn, conn.begin():
        # Insert only new flights because if there were already tracks for these flights that were already processed,
        #   the flights are already in the DB
        existing_flight_ids = pd.read_sql("SELECT flight_id FROM flights",
                                          conn).squeeze()
        flights.loc[~flights.flight_id.isin(existing_flight_ids)].to_sql(
            'flights', conn, if_exists='append', index=False)

        # Get the IDs of flights that were just inserted
        flight_ids = pd.read_sql(
            "SELECT id, flight_id FROM flights WHERE flight_id IN ('%s')" %
            "', '".join(flights.flight_id), conn)
        fees = fees.merge(flight_ids,
                          on='flight_id').rename(columns={'id': 'flight_id'})
        fees.loc[~fees.flight_id.isin(existing_flight_ids)]\
            .drop('flight_id', axis=1)\
            .to_sql('concession_fees', conn, if_exists='append', index=False)
        landings = landings.merge(
            flight_ids, on='flight_id').rename(columns={'id': 'flight_id'})

        # Remove any notes that were intended for data entry, then replace location names with codes
        locations = pd.read_sql_table('landing_locations', conn)
        location_codes = locations.set_index('name').code.to_dict(
        )  #db_utils.get_lookup_table(engine, 'landing_locations', index_col='name', value_col='code')
        scenic_location_notes = locations.dropna(
            subset=['scenic_data_entry_note'])
        scenic_notes = scenic_location_notes.set_index(
            scenic_location_notes.apply(lambda x: '%s - %s' %
                                        (x['name'], x.scenic_data_entry_note),
                                        axis=1))['name'].to_dict()
        taxi_location_notes = locations.dropna(subset=['taxi_data_entry_note'])
        taxi_notes = taxi_location_notes.set_index(
            taxi_location_notes.apply(lambda x: '%s - %s' %
                                      (x['name'], x.taxi_data_entry_note),
                                      axis=1))['name'].to_dict()

        landings = landings.replace({'location': scenic_notes})\
            .replace({'location': taxi_notes})\
            .replace({'location': location_codes})

        # Remove extraneous columns from landing data before inserting
        landings_cols = pd.read_sql(
            "SELECT column_name FROM information_schema.columns WHERE table_name = 'landings'",
            conn).squeeze()
        landings[[c for c in landings
                  if c in landings_cols.values]].to_sql('landings',
                                                        conn,
                                                        if_exists='append',
                                                        index=False)
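
The melt step above ("Transpose the landing data such that each landing/dropoff/pickup is its own line") turns wide per-flight landing columns into one row per landing. A minimal sketch of that transposition, with hypothetical column names following the same n_*/location_* pattern:

import pandas as pd

# Hypothetical wide-format landing data: one row per flight, one column per landing slot
wide = pd.DataFrame({
    'flight_id': ['ACM_C207_20240101_0900'],
    'n_scenic_1': [4],
    'n_taxi_1': [2],
    'location_scenic_1': ['DNL'],
    'location_taxi_1': ['TKA'],
})

# Melt only the passenger-count columns; each landing becomes its own row
long = wide.melt(
    id_vars=[c for c in wide if not c.startswith('n_')],
    value_name='n_passengers',
).dropna(subset=['n_passengers'])

# 'variable' holds the original column name, e.g. 'n_scenic_1' -> landing_type 'scenic'
long['landing_type'] = long.variable.apply(lambda x: x.split('_')[1])
print(long[['flight_id', 'variable', 'landing_type', 'n_passengers']])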
Example No. 9
def import_data(connection_txt=None,
                data=None,
                path=None,
                seg_time_diff=15,
                min_point_distance=200,
                registration='',
                submission_method='manual',
                operator_code=None,
                aircraft_type=None,
                silent=False,
                force_import=False,
                ssl_cert_path=None,
                engine=None,
                force_registration=False,
                ignore_duplicate_flights=False,
                **kwargs):

    if isinstance(data, gpd.GeoDataFrame):
        gdf = data.copy()
    elif path:
        gdf = format_track(path,
                           seg_time_diff=seg_time_diff,
                           min_point_distance=min_point_distance,
                           registration=registration,
                           submission_method=submission_method,
                           operator_code=operator_code,
                           aircraft_type=aircraft_type,
                           force_registration=force_registration)
    else:
        raise ValueError(
            'Either data (a geodataframe) or path (to a valid track file) must be given'
        )

    if not engine and connection_txt:
        engine = db_utils.connect_db(connection_txt)
    elif not engine:
        raise ValueError(
            'Either an SQLAlchemy Engine (from create_engine()) or connection_txt must be given'
        )

    # Recalculate landing time and duration here in case there were edits that changed these
    #   Also drop any segments with only one vertex
    gdf = calculate_duration(gdf)\
        .loc[gdf.duration_hrs > 0]

    # get columns from DB tables
    flight_columns = db_utils.get_db_columns('flights', engine)
    point_columns = db_utils.get_db_columns('flight_points', engine)
    line_columns = db_utils.get_db_columns('flight_lines', engine)
    # TODO: add submission table

    # separate flights, points, and lines
    flights = gdf[[c for c in flight_columns if c in gdf]].drop_duplicates()

    flights['end_datetime'] = gdf.groupby('flight_id').ak_datetime.max().values
    # if coming from web app, this should already be in the data so don't overwrite
    #if 'submitter' not in flights.columns:
    #    flights['submitter'] = os.getlogin()
    if 'track_editor' not in flights.columns:
        flights['track_editor'] = os.getlogin()  #flights.submitter
    if path and 'source_file' not in flights.columns:
        flights['source_file'] = os.path.join(ARCHIVE_DIR,
                                              os.path.basename(path))
    if not len(flights):
        raise ValueError('No flight segments found in this file.')

    points = gdf.copy()
    points['geom'] = gdf.geometry.apply(lambda g: WKTElement(g.wkt, srid=4326))
    points.drop(columns=points.columns[~points.columns.isin(point_columns)],
                inplace=True)

    line_geom = gdf.groupby('flight_id').geometry.apply(
        lambda g: shapely_LineString(g.to_list()))
    lines = gpd.GeoDataFrame(flights.set_index('flight_id'),
                             geometry=line_geom)
    lines['geom'] = lines.geometry.apply(
        lambda g: WKTElement(g.wkt, srid=4326))
    lines['flight_id'] = lines.index
    lines.drop(columns=lines.columns[~lines.columns.isin(line_columns)],
               inplace=True)
    lines.index.name = None

    with engine.connect() as conn, conn.begin():

        # Insert only new flights. Check for new flights by looking for flight points from the same registration number
        #   that are within the start and end times of each flight segment (since an aircraft can't be in 2 locations
        #   at the same time).
        existing_flight_info = []
        existing_flight_ids = []
        for _, f in flights.iterrows():
            matching_flights = check_duplicate_flights(f.registration, conn,
                                                       f.departure_datetime,
                                                       f.end_datetime)
            existing_flight_info.extend([
                (m.registration, m.departure_datetime)
                for _, m in matching_flights.iterrows()
            ])
            existing_flight_ids.extend(matching_flights.flight_id)
        if (existing_flight_info and not force_import
                and not ignore_duplicate_flights):
            existing_str = '\n\t-'.join(
                ['%s: %s' % f for f in existing_flight_info])
            raise ValueError(
                'The file {path} contains flight segments that already exist in the database as'
                ' indicated by the following registration and departure times:\n\t-{existing_flights}'
                '\nEither delete these flight segments from the database or run this script again with'
                ' the --force_import flag (ONLY USE THIS FLAG IF YOU KNOW WHAT YOU\'RE DOING).'
                .format(path=path, existing_flights=existing_str))

        new_flights = flights.loc[~flights.flight_id.isin(existing_flight_ids)]
        new_flights.drop(columns='end_datetime')\
            .to_sql('flights', conn, if_exists='append', index=False)

        # Warn the user if any of the flights already exist
        n_flights = len(flights)
        n_new_flights = len(new_flights)
        if n_new_flights == 0:
            raise ValueError(
                'No new flight segments were inserted from this file because they all already exist in'
                ' the database.')
        if n_flights != n_new_flights:
            warnings.warn(
                'For the file {path}, the following {existing} of {total} flight segments already exist:'
                '\n\t- {ids}'.format(path=path,
                                     existing=n_flights - n_new_flights,
                                     total=n_flights,
                                     ids='\n\t-'.join(existing_flight_ids)))

        # Get the numeric IDs of the flights that were just inserted and insert the points and lines matching those
        #   flight IDs that were just inserted
        flight_ids = pd.read_sql(
            "SELECT id, flight_id FROM flights WHERE flight_id IN ('%s')" %
            "', '".join(flights.flight_id), conn)
        points = points.merge(flight_ids, on='flight_id')
        points.loc[~points.flight_id.isin(existing_flight_ids)]\
            .drop('flight_id', axis=1) \
            .rename(columns={'id': 'flight_id'})\
            .to_sql('flight_points',
                    conn,
                    if_exists='append',
                    index=False,
                    dtype={'geom': Geometry('POINT Z', srid=4326)})
        lines = lines.merge(flight_ids, on='flight_id')
        lines.loc[~lines.flight_id.isin(existing_flight_ids)]\
            .drop('flight_id', axis=1) \
            .rename(columns={'id': 'flight_id'})\
            .to_sql('flight_lines',
                    conn,
                    if_exists='append',
                    index=False,
                    dtype={'geom': Geometry('LineStringZ', srid=4326)})

        # INSERT info about this aircraft if it doesn't already exist. If it does, UPDATE it if necessary
        #   disable because this happens now as a separate scheduled task
        if ssl_cert_path:
            ainfo.update_aircraft_info(conn, registration, ssl_cert_path)

    # VACUUM and ANALYZE clean up unused space and recalculate statistics to improve spatial query performance. Attempt
    #   to run these commands on both spatial tables, but if they fail, just warn the user since it's not that big of
    #   a deal
    try:
        with engine.execution_options(
                isolation_level='AUTOCOMMIT').connect() as conn:
            conn.execute('VACUUM ANALYZE flight_points;')
            conn.execute('VACUUM ANALYZE flight_lines;')
    except Exception:
        warnings.warn(
            "Unable to VACUUM and ANALYZE geometry tables. You should connect to the database and manually"
            " run 'VACUUM ANALYZE flight_points' and 'VACUUM ANALYZE flight_lines;' to ensure queries are as"
            " effecient as possible")

    # Archive the data file
    if not os.path.isdir(ARCHIVE_DIR):
        try:
            os.mkdir(ARCHIVE_DIR)
        except OSError:
            pass
    if os.path.isdir(os.path.dirname(path)):
        try:
            shutil.copy(path, ARCHIVE_DIR)
            os.remove(path)
        except Exception as e:
            warnings.warn(
                'Data successfully imported, but could not copy track files because %s. You will have to '
                'manually copy and paste this file to %s' % (e, ARCHIVE_DIR))

    if not silent:
        sys.stdout.write(
            '%d flight %s imported:\n\t-%s' %
            (len(flights), 'tracks' if len(flights) > 1 else 'track',
             '\n\t-'.join(flight_ids.flight_id)))
        sys.stdout.flush()
Example No. 10
def manual_classes_classifier(data_path, column, language, lemmatize,
                              manual_mappings, manual_classes,
                              predicted_classes_filename, should_upload_db,
                              account_key_path):
    print("Build classifier...")
    with open(manual_classes, encoding="utf8") as json_data:
        manual_classes_dict = json.load(json_data)
    classifier = Classifier(manual_classes_dict, language)
    print("Classifier built")
    print()

    print("Loading data...")
    data_df = load_data(data_path, column)
    print("Loaded data sample")
    print(data_df.head())
    print()

    print("Cleaning data...")
    data_df[column] = clean_data(data_df[column])
    print("Clean data sample")
    print(data_df.head())
    print()

    print("Removing stopwors...")
    data_df[column] = remove_stopwords(data_df[column], language)
    print("Data sample")
    print(data_df.head())
    print()

    if lemmatize:
        print("Lemmatizing data...")
        data_df[column] = lemmatize_text(data_df[column], language)
        print("Lemmatized data sample")
        print(data_df.head())
        print()

    if manual_mappings:
        print("Applying manual mappings...")
        data_df[column] = apply_manual_mappings(data_df[column],
                                                manual_mappings)
        print("Manually mapped data sample")
        print(data_df.head())
        print()

    print("Predict classes...")
    predicted_classes = predict(classifier, data_df[column])
    save_classes(predicted_classes, predicted_classes_filename)
    print("Predicted classes saved to:", predicted_classes_filename)
    print()

    if should_upload_db:
        db_client = connect_db(account_key_path)
        print("Uploading predicted classes to db...")
        upload_db(
            db_client, 'predicted_classes', {
                column:
                json.loads(
                    pd.DataFrame(predicted_classes).to_json(orient='index',
                                                            force_ascii=False))
            })
        print('Done')
        print()
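
The upload step above serializes the predictions through pandas before nesting them under the column name. A minimal sketch of that to_json/json.loads round trip, with hypothetical labels:

import json
import pandas as pd

# to_json(orient='index') keys rows by index; json.loads turns the result into
# a plain dict that can be stored as a document
predicted_classes = ['positive', 'negative']
as_dict = json.loads(
    pd.DataFrame(predicted_classes).to_json(orient='index', force_ascii=False))
print(as_dict)  # {'0': {'0': 'positive'}, '1': {'0': 'negative'}}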
Example No. 11
def text_analysis(
        data_path,
        column,
        groups,
        language,
        lemmatize,
        ngram_range,
        num_topics,
        num_words,
        manual_mappings,
        generate_word_cloud,
        word_cloud_filename,
        frequent_words_filename,
        frequent_words_plot_filename,
        top_tfidf_words_filename,
        top_tfidf_words_plot_filename,
        predict_topics,
        topics_filename,
        predicted_topics_filename,
        ldavis_filename_prefix,
        predict_sentiment,
        predicted_sentiment_filename,
        should_upload_db,
        account_key_path
):
    print("Loading data...")
    data_df = load_data(data_path, column, groups)
    print("Loaded data sample")
    print(data_df.head())
    print()

    print("Cleaning data...")
    data_df[column] = clean_data(data_df[column])
    print("Clean data sample")
    print(data_df.head())
    print()

    print("Removing stop words from data...")
    data_df[column] = remove_stopwords(data_df[column], language)
    print("Data sample")
    print(data_df.head())
    print()

    if lemmatize:
        print("Lemmatizing data...")
        data_df[column] = lemmatize_text(data_df[column], language)
        print("Lemmatized data sample")
        print(data_df.head())
        print()

    if manual_mappings:
        print("Applying manual mappings...")
        data_df[column] = apply_manual_mappings(data_df[column], manual_mappings)
        print("Manually mapped data sample")
        print(data_df.head())
        print()

    if generate_word_cloud:
        print("Generating word cloud...")
        plot_word_cloud(data_df[column], word_cloud_filename, language)
        print("word_cloud saved to:", word_cloud_filename)
        print()

    count_vectorizer, count_data = get_count_vectorizer_and_transformed_data(
        data_df[column], language, ngram_range
    )
    all_word_count_pair_list = most_frequent_words(
        count_data, count_vectorizer, count_data.shape[0] + 1
    )
    word_count_pair_list = all_word_count_pair_list[:num_words]

    tfidf_vectorizer, tfidf_data = get_tfidf_vectorizer_and_transformed_data(
        data_df[column], language, ngram_range
    )
    all_tfidf_pair_list = most_frequent_words(
        tfidf_data, tfidf_vectorizer, tfidf_data.shape[0] + 1
    )
    tfidf_pair_list = all_tfidf_pair_list[:num_words]

    print("Saving frequent words...")
    save_words(
        all_word_count_pair_list,
        frequent_words_filename
    )
    print("Frequent words saved to:", frequent_words_filename)
    print()

    if should_upload_db:
        db_client = connect_db(account_key_path)
    else:
        db_client = None

    if should_upload_db:
        print("Uploading frequent words to db...")
        upload_db(db_client, 'frequent_words', {
            column: {w: int(c) for w, c in word_count_pair_list}
        })
        print('Done')
        print()

    print("Generating frequent word plot...")
    plot_top_words(word_count_pair_list, frequent_words_plot_filename)
    print("Frequent word plot saved to:", frequent_words_plot_filename)
    print()

    print("Saving top tfidf words...")
    save_words(
        all_tfidf_pair_list,
        top_tfidf_words_filename
    )
    print("Top tfidf words saved to:", top_tfidf_words_filename)
    print()

    if should_upload_db:
        print("Uploading frequent words to db...")
        upload_db(db_client, 'top_tfidf', {
            column: {w: int(c) for w, c in tfidf_pair_list}
        })
        print('Done')
        print()

    print("Generating top tfidf word plot...")
    plot_top_words(tfidf_pair_list, top_tfidf_words_plot_filename)
    print("Top tfidf word plot saved to:", top_tfidf_words_plot_filename)
    print()

    if groups:
        group_unique_vals = {}
        for group in groups:
            group_unique_vals[group] = data_df[group].unique()

        splits = {}
        for group, unique_vals in group_unique_vals.items():
            for val in unique_vals:
                splits[(group, val)] = data_df[group] == val

        for i in range(len(groups) - 1):
            splits = concat_splits(splits)

        grouped_words_counts = {}
        grouped_words_tfidf = {}

        for key, split_idcs in splits.items():
            split = data_df[split_idcs]
            split_texts = split[column]

            if len(split_texts) > 0 and any(split_texts.str.len() > 0):
                word_cloud_filename_val = add_prefix_to_filename(
                    word_cloud_filename, key
                )
                frequent_words_filename_val = add_prefix_to_filename(
                    frequent_words_filename, key
                )
                frequent_words_plot_filename_val = add_prefix_to_filename(
                    frequent_words_plot_filename, key
                )
                top_tfidf_words_filename_val = add_prefix_to_filename(
                    top_tfidf_words_filename, key
                )
                top_tfidf_words_plot_filename_val = add_prefix_to_filename(
                    top_tfidf_words_plot_filename, key
                )

                if generate_word_cloud:
                    print("Generating word cloud...")
                    plot_word_cloud(split_texts, word_cloud_filename_val, language)
                    print("word_cloud saved to:", word_cloud_filename_val)
                    print()

                try:
                    count_vectorizer, count_data = get_count_vectorizer_and_transformed_data(
                        split_texts, language, ngram_range
                    )
                    all_word_count_pair_list = most_frequent_words(
                        count_data, count_vectorizer, count_data.shape[0] + 1
                    )
                    word_count_pair_list = all_word_count_pair_list[:num_words]

                    tfidf_vectorizer, tfidf_data = get_tfidf_vectorizer_and_transformed_data(
                        split_texts, language, ngram_range
                    )
                    all_tfidf_pair_list = most_frequent_words(
                        tfidf_data, tfidf_vectorizer, tfidf_data.shape[0] + 1
                    )
                    tfidf_pair_list = all_tfidf_pair_list[:num_words]

                    print("Saving frequent words...")
                    save_words(
                        all_word_count_pair_list,
                        frequent_words_filename_val
                    )
                    print("Frequent words saved to:", frequent_words_filename_val)
                    print()

                    print("Generating frequent word plot...")
                    plot_top_words(word_count_pair_list, frequent_words_plot_filename_val)
                    print("Frequent word plot saved to:", frequent_words_plot_filename_val)
                    print()

                    print("Saving top tfidf words...")
                    save_words(
                        all_tfidf_pair_list,
                        top_tfidf_words_filename_val
                    )
                    print("Top tfidf words saved to:", top_tfidf_words_filename_val)
                    print()

                    print("Generating top tfidf word plot...")
                    plot_top_words(tfidf_pair_list, top_tfidf_words_plot_filename_val)
                    print("Top tfidf word plot saved to:", top_tfidf_words_plot_filename_val)
                    print()

                    grouped_words_counts[key[1::2]] = {
                        w: int(c) for w, c in all_word_count_pair_list
                    }
                    grouped_words_tfidf[key[1::2]] = {
                        w: int(c) for w, c in all_tfidf_pair_list
                    }
                except Exception:
                    print("Error processing", key,
                          "skipping it. texts are probably all stopwords")

        print("Saving grouped frequent words...")
        group_frequent_words_filename = add_prefix_to_filename(
            frequent_words_filename, groups
        )
        remapped_grouped_words_counts = remap_keys(grouped_words_counts, groups)
        with open(group_frequent_words_filename, 'w', encoding="utf8") as f:
            json.dump(remapped_grouped_words_counts, f, ensure_ascii=False)
        print("Frequent words saved to:", group_frequent_words_filename)
        print()

        if should_upload_db:
            print("Uploading grouped_words_counts to db...")
            upload_db(db_client, 'grouped_words_counts', {
                column: remap_to_dict(remapped_grouped_words_counts)
            })
            print('Done')
            print()

        print("Saving grouped top tfidf words...")
        group_top_tfidf_words_filename = add_prefix_to_filename(
            top_tfidf_words_filename, groups
        )
        remapped_grouped_words_tfidf = remap_keys(grouped_words_tfidf, groups)
        with open(group_top_tfidf_words_filename, 'w', encoding="utf8") as f:
            json.dump(remapped_grouped_words_tfidf, f, ensure_ascii=False)
        print("Top tfidf words saved to:", group_top_tfidf_words_filename)
        print()

        if should_upload_db:
            print("Uploading grouped_words_tfidf to db...")
            upload_db(db_client, 'grouped_words_tfidf', {
                column: remap_to_dict(remapped_grouped_words_tfidf)
            })
            print('Done')
            print()

    if predict_topics:
        print("Calculating topic model...")
        lda, predicted_topics = learn_topic_model(tfidf_data, num_topics)
        print("Topics found via LDA:")
        print_topics(lda, tfidf_vectorizer, num_words)
        print("Saving topics...")
        save_topics(lda, tfidf_vectorizer, topics_filename)
        print("Topics saved to:", topics_filename)
        print()

        print("Saving predicted topics...")
        save_predicted_topics(predicted_topics, predicted_topics_filename)
        print("Predicted topics saved to:", predicted_topics_filename)
        print()

        if should_upload_db:
            print("Uploading predicted topics to db...")
            upload_db(db_client, 'predicted_topics', {
                column: json.loads(pd.DataFrame(predicted_topics).to_json(
                    orient='index', force_ascii=False
                ))
            })
            print('Done')
            print()

        print("Generating LDA visualization...")
        visualize_topic_model(lda, count_data, tfidf_vectorizer,
                              num_topics, ldavis_filename_prefix)
        print("LDA visualization saved to:", ldavis_filename_prefix)
        print()

    if predict_sentiment:
        if language == 'it':
            print("Predict sentiment...")
            predicted_sentiment = predict_sentiment_with_sentita(data_df[column])
            save_sentiment(predicted_sentiment, predicted_sentiment_filename)
            print("Predict sentiment saved to:", predicted_sentiment_filename)
            print()

            if should_upload_db:
                print("Uploading predicted sentiment to db...")
                upload_db(db_client, 'predicted_sentiment', {
                    column: json.loads(pd.DataFrame(predicted_sentiment).to_json(
                        orient='index', force_ascii=False
                    ))
                })
                print('Done')
                print()

        elif language == 'en':
            print("Predict sentiment...")
            predicted_sentiment = predict_sentiment_with_paralleldots(data_df)
            save_sentiment(predicted_sentiment, predicted_sentiment_filename)
            print("Predict sentiment saved to:", predicted_sentiment_filename)
            print()

            if should_upload_db:
                print("Uploading predicted sentiment to db...")
                upload_db(db_client, 'predicted_sentiment', {
                    column: json.loads(pd.DataFrame(predicted_sentiment).to_json(
                        orient='index', force_ascii=False
                    ))
                })
                print('Done')
                print()
        else:
            print("Sentiment analysis on {} language is not supported")
            print()
Example No. 12
def query_tracks(start_date, end_date, connection_txt=None, engine=None, table='flight_points', start_time='00:00', end_time='23:59', bbox=None, mask=None, mask_buffer_distance=None, clip_output=False, aircraft_info=False, sql_criteria=''):
    '''
    Query the overflights database with specified parameters. Results are returned as a GeoPandas.GeoDataFrame instance.

    :param start_date:      ISO date string (YYYY-mm-dd) indicating the beginning of the date range to query within 
    :param end_date:        ISO date string (YYYY-mm-dd) indicating the end of the date range to query within
    :param connection_txt:  [optional] path to a text file containing postgres connection params for the overflights DB.
                            The text file must be readable by db_utils.connect_db(). If engine is not given,
                            connection_txt must be specified.
    :param engine:          [optional] SQLAlchemy Engine instance for connecting to the overflights DB. If
                            connection_txt is not given, engine must be specified.
    :param table:           [optional] string representing the name of the table to return geometries from overflights
                            DB. Options are either 'flight_points' (the default) or 'flight_lines'
    :param start_time:      [optional] string representing the earliest time of day on a 24-hour clock to return data
                            from. Must be in the format HH:MM or H:MM (e.g., 09:30 or 9:30) [Default: '00:00']
    :param end_time:        [optional] string representing the latest time of day on a 24-hour clock to return data
                            from. Must be in the format HH:MM or H:MM (e.g., 09:30 or 9:30) [Default: '23:59']
    :param bbox:            [optional] WGS84 bounding box coordinates to query records within in the format 'xmin, ymin,
                            xmax, ymax'. If a mask is specified, the bounding box will be ignored [Default: None]
    :param mask:            [optional] GeoPandas.GeoDataFrame instance used to spatially filter query results. If you
                            specify a mask with Point or Line geometries, you must also specify a mask_buffer_distance.
                            [Default: None]
    :param mask_buffer_distance: [optional] Integer distance in meters (as measured in Alaska Albers Equal Area Conic
                                 projection) to buffer around all features in mask_file. [Default: None]
    :param clip_output:     [optional] boolean to indicate that the result should be the intersection of mask_file and
                            the result of the non-spatial query criteria. If this option is not given, all features
                            that touch mask_file will be returned, but they will not be clipped to its shape
                            [Default: False]
    :param aircraft_info:   [optional] boolean to return information about the aircraft (manufacturer, model, engine
                            model, aircraft type, etc.) appended to each row of the query result. These additional
                            fields can also be used in the sql_criteria to filter results non-spatially based on aircraft information
                            (e.g., "type_aircraft = 'Fixed Wing Single-Engine'") [Default: False]
    :param sql_criteria:    [optional] string representing additional SQL criteria to append to a WHERE statement (e.g.,
                            'flights.id IN (104, 105, 106)' to limit results to records with those flight IDs)

    :return:                GeoPandas.GeoDataFrame instance of query results.
    '''

    if not engine:
        if not connection_txt:
            raise ValueError('You must either specify an SQLAlchemy Engine or connection_txt to connect to the database')
        engine = db_utils.connect_db(connection_txt)

    with engine.connect() as conn, conn.begin():
        query_columns = pd.Series(
            ['flights.' + c for c in db_utils.get_db_columns('flights', engine)
             if c not in ['source_file', 'time_submitted']] + \
            [f'{table}.{c}' for c in db_utils.get_db_columns(table, engine)
             if c not in ['flight_id', 'id']])
        if aircraft_info:
            query_columns = pd.concat([query_columns, pd.Series(['aircraft_info.*'])])
                #['aircraft_info.' + c for c in db_utils.get_db_columns('aircraft_info', engine)])

    mask_specified = isinstance(mask, gpd.geodataframe.GeoDataFrame)
    if mask_specified:
        # Make sure the mask is in WGS84 (same as database features)
        if mask.crs.to_epsg() != 4326:
            mask = mask.to_crs(epsg=4326)

        mask_wkt = get_mask_wkt(mask, mask_buffer_distance)
        if clip_output:
            query_columns.replace(
                {f'{table}.geom': "ST_Intersection(geom, ST_GeomFromText('%s', 4326)) AS geom" % mask_wkt},
                inplace=True)
            if table == 'flight_points':
                warnings.warn("You specified clip_output=True, but you're querying the flight_points table. This will "
                              "take much longer and yield the same result as specifying a mask with clip_output=False "
                              "(the default)")
        if bbox:
            warnings.warn('You specified both a mask_file and a bbox, but only the mask_file will be used to filter results')
    elif clip_output:
        warnings.warn('clip_output was set to True, but you did not specify a mask_file to spatially filter query results.')

    # Compose the SQL
    if start_time and end_time:
        # If the user is getting points, select points by their timestamp
        if table == 'flight_points':
            time_clause = f"flight_points.ak_datetime::time >= '{start_time}' AND " \
                          f"flight_points.ak_datetime::time <= '{end_time}'"
        # Otherwise, the user is getting lines, so the only timestamps are the departure and landing times
        else:
            time_clause = f"departure_datetime::time >= '{start_time}' AND landing_datetime::time <= '{end_time}'"
    else:
        time_clause = ''

    sql = '''SELECT {columns} FROM {table}
          INNER JOIN flights ON flights.id = {table}.flight_id 
          {aircraft_join}
          WHERE 
             {date_field}::date BETWEEN '{start_date}' AND '{end_date}' AND 
             {time_clause}
             {bbox_criteria}
             {spatial_filter}
             {other_criteria}'''\
        .format(columns=', '.join(query_columns),
                table=table,
                aircraft_join="INNER JOIN aircraft_info ON aircraft_info.registration = flights.registration" if aircraft_info else '',
                date_field="ak_datetime" if table == 'flight_points' else "departure_datetime",
                time_clause=time_clause if start_time and end_time else '',
                start_date=start_date,
                end_date=end_date,
                start_time=start_time,
                end_time=end_time,
                bbox_criteria=f" AND ST_Intersects(ST_MakeEnvelope({bbox}, 4326), geom)" if bbox else "",
                spatial_filter=f" AND ST_Intersects(geom, ST_GeomFromText('{mask_wkt}', 4326))" if mask_specified and not clip_output else '',
                other_criteria=" AND " + sql_criteria if sql_criteria else ''
                )

    with engine.connect() as conn, conn.begin():
        data = gpd.GeoDataFrame.from_postgis(sql, conn, geom_col='geom')

    # Clipping will return null geometries if other SQL criteria would have returned additional features, so remove those empty geometries
    data = data.loc[~data.geometry.is_empty]

    return data
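
A hedged usage sketch for query_tracks, following the docstring above; the connection file path, dates, and bounding box are hypothetical.

# Connect once and reuse the engine for the query
engine = db_utils.connect_db('config/connection_info.txt')  # hypothetical path
tracks = query_tracks(
    start_date='2021-06-01',
    end_date='2021-06-30',
    engine=engine,
    table='flight_points',
    bbox='-151.5, 63.0, -148.5, 64.0',  # 'xmin, ymin, xmax, ymax' in WGS84
    aircraft_info=True,
)
print(len(tracks), 'flight points returned')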