def get_brand_sku(category_level3_id, crawl_id):
    conn = pymysql.connect(host='127.0.0.1', user='******', password='******',
                           db='customer', charset='utf8')
    # all brand-sku pairs in sku_jd
    sku_all = []
    sql_all = 'select concat(brand_id, "-", sku_group) as sku\
               from sku_jd where crawl_id=(select crawl_id from sku_jd\
               order by id desc limit 1) and category_level3_id="%s"'\
               % category_level3_id  # filter by the latest crawl_id
    records_all = pd.read_sql(sql_all, conn)
    records_all = records_all.drop_duplicates('sku')
    for i in records_all['sku']:
        sku_all.append(i)
    # brand-sku pairs already crawled into comment_count_jd
    sku_crawled = []
    sql_crawled = 'select concat(brand_id, "-", sku_group) as sku\
                   from comment_count_jd where crawl_id=%s\
                   and category_level3_id="%s"' % (crawl_id, category_level3_id)
    records_crawled = pd.read_sql(sql_crawled, conn)
    records_crawled = records_crawled.drop_duplicates('sku')
    for i in records_crawled['sku']:
        sku_crawled.append(i)
    # sku not yet crawled
    sku_task = [i for i in sku_all if i not in sku_crawled]
    sku_task.sort()
    n = len(sku_task)
    conn.close()
    return sku_task, n
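A set-based sketch of the same "not yet crawled" step (the helper name is hypothetical; the result matches the list comprehension above because both inputs are already deduplicated):

def pending_skus(sku_all, sku_crawled):
    # SKUs present in sku_jd but missing from comment_count_jd, sorted, with their count.
    sku_task = sorted(set(sku_all) - set(sku_crawled))
    return sku_task, len(sku_task)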
Example #2
def get_labels_for_ids(ids, start_date, end_date):
    qinvest = ("SELECT newid, count(adverse_by_ourdef) from {} "
                  "WHERE dateoccured >= '{}'::date "
                  "AND dateoccured <= '{}'::date "
                  "AND newid in ({}) "
                  "group by newid "
                  ).format(config["si_table"],
                           start_date, end_date,
                           format_officer_ids(ids))

    qadverse = ("SELECT newid, count(adverse_by_ourdef) from {} "
                    "WHERE adverse_by_ourdef = 1 "
                    "AND dateoccured >= '{}'::date "
                    "AND dateoccured <= '{}'::date "
                    "AND newid in ({}) "
                    "group by newid "
                    ).format(config["si_table"],
                             start_date, end_date,
                             format_officer_ids(ids))

    invest = pd.read_sql(qinvest, con=con)
    adverse = pd.read_sql(qadverse, con=con)
    adverse["adverse_by_ourdef"] = 1
    adverse = adverse.drop(["count"], axis=1)
    invest = invest.drop(["count"], axis=1)
    outcomes = adverse.merge(invest, how='outer', on='newid')
    outcomes = outcomes.fillna(0)

    return outcomes
Example #3
def mf_lookup(find_str, item):
    """
    Given a fund manager name or security code (find_str), look up the ManagerID.
    """
    cnxn_jrgcb = pyodbc.connect("""
        DRIVER={SQL Server};
        SERVER=172.16.7.166;
        DATABASE=jrgcb;
        UID=sa;
        PWD=sa123456""")
    if item == 'Name':
        sql_mf = """
        SELECT DISTINCT ManagerID
            FROM [jrgcb].[dbo].[FundAndManagerData_v2]
            WHERE [Name] = '""" + find_str + """'
        """
        return pd.read_sql(sql_mf, cnxn_jrgcb)
    if item == 'SecuCode':
        sql_mf = """
        SELECT DISTINCT ManagerID
            FROM [jrgcb].[dbo].[FundAndManagerData_v2]
            WHERE [SecuCode] = '""" + find_str + """'
        """
        return pd.read_sql(sql_mf, cnxn_jrgcb)
Example #4
def search_posts(phrase, engine):
    lemmatizer = WordNetLemmatizer()
    words = ["(^|[^a-z])" + lemmatizer.lemmatize(word)
                for word in word_tokenize(phrase)
                    if word not in stopwords.words('english')
                    and len(word) >= 3]

    if len(words) == 0:
        return None

    params = {'phrase': "|".join(words)}
    query = ["SELECT link_id, url, title FROM threads", 
             "WHERE title_lower ~ %(phrase)s"]
    found = pd.read_sql(" ".join(query), 
                       engine, 
                       params=params)
    
    if len(found['link_id']) == 0: 
        return None 

    link_ids = ', '.join(found['link_id'].apply(lambda lid: "'" + lid + "'"))
    query = ["SELECT clean_body as body, affil, link_id FROM cleaned", 
             "WHERE link_id IN (" + link_ids + ")"]
    data = pd.read_sql(" ".join(query), engine)
    
    valid = data[data['body'].apply(lambda text: len(text.split()) >= 10 
                                 and not bool(re.search("[^a-z]bot[^a-z]", text)))]
    
    if valid.shape[0] < 60: 
        return None
    
    return valid, found.set_index('link_id')
Example #5
def show_products():

    cur, conn = mysql_connect("leo_markt")
    selected_category = request.form.get("category_to_show")

    if selected_category == "":
        flash("Please select a category ", "msg")

    if (request.method == "POST") and (request.form['add'] == "show_products"):
        df = pd.read_sql(""" SELECT * FROM products where category = '%s' """ % selected_category, con = conn)
    elif (request.method == "POST") and (request.form['add'] == "refresh"):
        print("refresh pushed")
        df = pd.read_sql(""" SELECT * FROM products """, con = conn)
    else:
        df = pd.read_sql(""" SELECT * FROM products """, con = conn)

    df = df.set_index('id')

    pr_id = df.index.tolist()
    pr_names = df.name.tolist()
    # pr_descriptions = df.description.tolist()
    pr_prices = df.price.tolist()
    # change format from 1,000.00 to 1.000,00
    pr_prices = ['{:,.2f} €'.format(i).replace(",", "X").replace(".", ",").replace("X", ".") for i in pr_prices]
    pr_prices = [i.decode('utf-8') for i in pr_prices]  # euro symbol not in ASCII
    pr_availability = df.availability.tolist()
    pr_categories = df.category.tolist()

    products_zip = zip(pr_id, pr_names, pr_prices, pr_availability, pr_categories)
    cur.close()
    conn.close()
    return products_zip
    def __get_neighbor_movie__(self, candidate_id, viewed):
        """
        Get the most similar movie with respect to candidate_id

        :return: array of movies id(s)
        """
        current_movie = pd.read_sql("movies", self.engine).query("id_movie == " + str(candidate_id)).as_matrix()
        movies = pd.read_sql("movies", self.engine).as_matrix()
        candidate = []
        # max = 0
        for i in range(len(movies)):
            simRate = VideoBasedRecommendation.__check_similarity__(self, current_movie[0], movies[i])
            candidate.append([movies[i][0], simRate]) #id movie
            # if (simRate > max)
            #     max = simRate

        # sort
        VideoBasedRecommendation.quick_sort(self, 0, len(candidate)-1, candidate)
        # print(candidate)

        # select the best score
        out = []
        i = 0
        prev = candidate[0][1]
        while (i < len(candidate)):
            if (prev != candidate[i][1]):
                break
            if (candidate[i][0] not in viewed):
                out.append(candidate[i][0])
            prev = candidate[i][1]
            i += 1
        random.shuffle(out)
        return out[0:5]
Example #7
def get_pitchab_for_pitcher(pitcher_name, con, reg=True):
    """
    Get everything from pitch and atbat for a specific pitcher, 
    merge on gameday_link + num
    usage: get_pitchab_for_pitcher(pitcher_name, con, reg=True)
    set "reg=False" to get spring training, all-star, post-season games
    """

    atbat_sql = """select * from atbat where pitcher_name = "%s" """ % pitcher_name

    pitch_sql = (
        """select * from pitch where gameday_link in 
    (select gameday_link from atbat where pitcher_name = "%s") """
        % pitcher_name
    )

    atbat = pd.read_sql(atbat_sql, con)
    pitch = pd.read_sql(pitch_sql, con)

    pitchab = pitch.merge(atbat, on=["gameday_link", "num"])
    pitchab.dropna(subset=["px"], inplace=True)

    if reg:
        game_sql = """select gameday_link from game where game_type="R" """
        reg_gdls_df = pd.read_sql(game_sql, con)
        reg_gdls = ["gid_%s" % x for x in reg_gdls_df["gameday_link"].values]
        pitchab = pitchab[pitchab["gameday_link"].isin(reg_gdls)]
    for param in ("break_angle", "break_length", "break_y"):
        pitchab[param] = pd.to_numeric(pitchab[param])
    return pitchab
def comp(database, metadata, site_info, hole_info):
    engine = create_engine(database)

    # Load metadata
    sql = "SELECT * FROM {};".format(metadata)
    metadata = pd.read_sql(sql, engine)

    # Load site data
    sql = "SELECT * FROM {};".format(site_info)
    sitedata = pd.read_sql(sql, engine)

    # Load hole data
    sql = "SELECT * FROM {};".format(hole_info)
    holedata = pd.read_sql(sql, engine)
    # Group and average hole data for sites
    hole_grouped = holedata.loc[:,('site_key', 'lat','lon','water_depth',
                                   'total_penetration','etopo1_depth',
                                   'surface_porosity', 'sed_thickness',
                                   'crustal_age','coast_distance',
                                   'ridge_distance', 'seamount',
                                   'surface_productivity','toc', 'opal',
                                   'caco3', 'woa_temp', 'woa_salinity',
                                   'woa_o2','lith1','lith2','lith3','lith4',
                                   'lith5','lith6','lith7','lith8','lith9',
                                   'lith10','lith11','lith12','lith13'
                                   )].groupby("site_key").mean().reset_index()

    # Combine all tables
    site_meta_data = pd.merge(metadata, sitedata, how='outer', on=('site_key', 'leg', 'site'))
    data = pd.merge(site_meta_data, hole_grouped, how='outer', on=('site_key')).fillna(np.nan)
    site_metadata = data.dropna(subset = ['interface_flux']).reset_index(drop=True)
    return site_metadata
def dedup_table():
  # http://stackoverflow.com/a/7745635/424631
  dbname = 'sqlite+pysqlite:////home/aahu/Dropbox/black-market-recommender-systems/data/bmrs.db'
  conn = sqlalchemy.create_engine(dbname, module=sqlite3.dbapi2)
  init_size = pd.read_sql('SELECT COUNT(*) FROM bmrs;', conn)
  logger.info('initial size: {}'.format(init_size))
  logger.info('batch scrapes together...')
  q = """
  SELECT d1.*
  FROM bmrs d1
    LEFT OUTER JOIN bmrs d2
    ON (d1.listing = d2.listing AND d1.vendor = d2.vendor AND
      d1.marketplace = d2.marketplace AND d1.category = d2.category AND
      d1.cat_tuple = d2.cat_tuple AND d1.ships_from = d2.ships_from AND
      d1.ships_to = d2.ships_to AND d1.scrape_date < d2.scrape_date)
  WHERE d2.listing IS NULL AND d2.vendor IS NULL AND
  d2.marketplace IS NULL AND d2.category IS NULL AND
  d2.cat_tuple IS NULL AND d2.ships_from IS NULL AND d2.ships_to IS NULL;
  """
  df = pd.read_sql(q, conn)
  df = df.drop_duplicates()
  print(df)
  logger.info('shape now: {}'.format(df.shape))
  logger.info('overwriting old table...')
  df.to_sql('bmrs', conn, index=False, if_exists='replace')
  return
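For reference, a rough pandas equivalent of the anti-join above (a sketch only, assuming the same column names; ties on scrape_date collapse to one row here, whereas the SQL keeps them all):

def dedup_latest(df):
    # Keep the newest scrape_date per listing key, mirroring the
    # LEFT OUTER JOIN ... IS NULL anti-join in the SQL above.
    keys = ['listing', 'vendor', 'marketplace', 'category',
            'cat_tuple', 'ships_from', 'ships_to']
    return (df.sort_values('scrape_date')
              .drop_duplicates(subset=keys, keep='last')
              .reset_index(drop=True))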
Example #10
    def positions(self, algo_id=None, json=True):

        if algo_id is not None:
            algo_id = algo_id.replace('/', '')

        trades_query = "SELECT * FROM trades WHERE exit_time IS NULL"
        if algo_id is not None:
            trades_query += " AND algo='" + algo_id + "'"

        trades = pd.read_sql(trades_query, self.dbconn)

        last_query = "SELECT s.id, s.symbol, max(t.last) as last_price FROM ticks t LEFT JOIN symbols s ON (s.id=t.symbol_id) GROUP BY s.id"
        last_prices = pd.read_sql(last_query, self.dbconn)

        trades = trades.merge(last_prices, on=['symbol'])

        trades['unrealized_pnl'] = np.where(
                trades['direction']=="SHORT",
                trades['entry_price']-trades['last_price'],
                trades['last_price']-trades['entry_price'])

        trades['slippage'] = abs(trades['entry_price'] - trades['market_price'])
        trades['slippage'] = np.where(
            ((trades['direction'] == "LONG") & (trades['entry_price'] > trades['market_price'])) |
            ((trades['direction'] == "SHORT") & (trades['entry_price'] < trades['market_price'])),
            -trades['slippage'], trades['slippage'])

        trades = trades.sort_values(['entry_time'], ascending=[False])

        trades = trades.to_dict(orient="records")
        if json:
            return jsonify(trades)
        else:
            return trades
def dominant_set_topic_rank():
    #dominant_set
    conn = sqlite3.connect("zhihu.db")     
    following_data = pd.read_sql('select user_url, followee_url from Following where followee_url in (select user_url from User where agree_num > 50000) and user_url in (select user_url from User where agree_num > 50000)', conn)        
    #following_data = pd.read_sql('select user_url, followee_url from Following where followee_url in (select user_url from User where agree_num > 10000) and user_url in (select user_url from User where agree_num > 10000)', conn)        
    G = nx.DiGraph()
    for d in following_data.iterrows():
        G.add_edge(d[1][0], d[1][1])
    dominant_set = nx.dominating_set(G)
    print 'user number in dominant set:', len(dominant_set)

    #topics answered by users in dominant_set
    user_topic_data = pd.read_sql('select user_url, topic from UserTopic', conn) 
       
    topicdict = defaultdict(int)
    i = 0#counter
    for row in user_topic_data.iterrows():
        user_url = row[1][0]
        topic = row[1][1]
        if user_url in dominant_set:
            topicdict[topic] += 1
        i += 1
        #if i % 100 == 0:
            #print i
    conn.close()
    
    topicsorted = sorted(topicdict.items(), key=lambda x: x[1], reverse=True)
    
    # topic top 100
    for t in topicsorted[:100]:
        print t[0],t[1]
Example #12
 def _calculate_factor_value_by_pd(self, stkcode, store):
     """"""
     msg = "Processing stock {}".format(stkcode)
     self.log.info(msg)
     
     raw_cur = self.raw_conn.cursor()
     sql = """
           select Date,ClosePrice,AFloatShare,TotalShare
           from market_data_a_share2
           where StkCode='{}' and Date>='{}'
           """.format(stkcode, self.start_date)
     df_mkt = pd.read_sql(sql, self.raw_conn, index_col='Date')
     
     prc_cur = self.prc_conn.cursor()
     sql = """
           select *
           from financial_data
           where StkCode='{}' and Date>='{}'
           """.format(stkcode, self.start_date)
     df_fin = pd.read_sql(sql, self.prc_conn, index_col='Date')
     
     data = pd.concat([df_mkt,df_fin], axis=1)
     data = data.fillna(method='ffill')
     
     vals = []
     for name in self.fct_name:
         _val = eval(self.fct_algos[name])
         vals.append(_val.to_frame(name))
     result = pd.concat(vals, axis=1) 
     #store['val_'+stkcode] = result
     return result
def retrieve_best_hotels(city, state=''):
    """PURPOSE: To """
    engine = cadb.connect_aws_db(write_unicode=True)
    conn = engine.connect()
    cmd = 'SELECT * FROM yelp_reviews'

    yelp_reviews = pd.read_sql(cmd, engine)

    cmd = 'SELECT * FROM yelp_hotels'
    yelp_hotels = pd.read_sql(cmd, engine)
    yelp = pd.merge(yelp_hotels, yelp_reviews, on='business_id', how='inner')

    yelp_city = yelp[yelp['hotel_city'] == city.strip()]

    yelp_dog_review = yelp_city[yelp_city['review_text'].str.contains('dog')].copy().reset_index()

    average_dog_ratings = [np.mean(yelp_dog_review[yelp_dog_review['hotel_id'] == hotel_id]['review_rating'].values) for hotel_id in np.unique(yelp_dog_review['hotel_id'])]

    unique_hotels = yelp_dog_review[yelp_dog_review['hotel_id'].isin(np.unique(yelp_dog_review['hotel_id']))].copy()

    unique_hotels.drop_duplicates(cols='hotel_id', inplace=True)

    unique_hotels['average_rating'] = average_dog_ratings

    best_dog_hotel_names = unique_hotels.sort(columns='average_rating', ascending=False)['hotel_name'].head(10).values

    best_dog_hotel_ratings = np.round(unique_hotels.sort(columns='average_rating', ascending=False)['average_rating'].head(10).values, 1)

    string_ratings = [str(rat) for rat in best_dog_hotel_ratings]

    #print('best dog hotels:')
    #print(best_dog_hotel_names)

    return best_dog_hotel_names, string_ratings
Example #14
    def get_data(self):
        DB = sqlite3.connect(self.DBname)
        if self.permissionLevel == 'admin':
            results = pd.read_sql("SELECT * from results", DB)
        else:
            ID_query = "SELECT staff_code from staff where username = ?"
            staffID = pd.read_sql(ID_query,
                                  DB,
                                  params=[self.username])
            set_query = "SELECT teaching_set from staffing where staff_code = ?"
            setlist = pd.read_sql(set_query,
                                  DB,
                                  params=[staffID['staff_code'][0]])
            sets = setlist['teaching_set'].tolist()
            results_query = (
                "SELECT * from results where " +
                " or ".join(("teaching_set = " + "'" + str(n) + "' " for n in sets)))
            results = pd.read_sql(results_query, DB)
        assessments = pd.read_sql("SELECT * from assessments", DB)
        merged = pd.merge(
            results,
            assessments,
            how='left',
            left_on=['aID', 'qNum'],
            right_on=['aID', 'qNum'])
        merged = merged.drop([
            'aName_y', 'qTitle',
            'aName_x', 'course',
            'course_ID', 'module_ID'],
            axis=1)
        cols = ['aID', 'qNum', 'UPN', 'qModule', 'qTopic',
                'pMark', 'qMark', 'teaching_set']
        df = merged[cols]

        return df
Example #15
    def __init__(self, name, tag, thresh, category, db_path, db_lock):
        Filter.__init__(self, name, db_path, db_lock)

        self.tag = tag
        self.thresh = thresh
        self.category = category
        assert self.category == "track" or self.category == "album"

        self.lock.acquire()

        if self.category == "track":
            self.IDs = pd.read_sql(
                "SELECT ID_track.trackID \
                                  FROM tag_artist LEFT JOIN ID_tag ON ID_tag.tagID=tag_artist.tagID \
                                  LEFT JOIN ID_track ON ID_track.artistID=tag_artist.artistID\
                                  WHERE ID_tag.tagName=? AND tag_artist.count>=?",
                self.con,
                params=(self.tag, self.thresh),
            )

            self.IDs.columns = ["trackID"]

        if self.category == "album":
            self.IDs = pd.read_sql(
                "SELECT ID_album.albumID \
                                  FROM tag_artist LEFT JOIN ID_tag ON ID_tag.tagID=tag_artist.tagID \
                                  LEFT JOIN ID_album ON ID_album.artistID=tag_artist.artistID\
                                  WHERE ID_tag.tagName=? AND tag_artist.count>=?",
                self.con,
                params=(self.tag, self.thresh),
            )

            self.IDs.columns = ["albumID"]

        self.lock.release()
Example #16
    def __init__(
        self, name, db_path, db_lock, minim=0, maxim=time.strftime("%Y", time.localtime()), include_undated=False
    ):
        Filter.__init__(self, name, db_path, db_lock)

        self.minim = minim
        self.maxim = maxim

        if maxim and minim:
            assert self.minim < self.maxim

        self.lock.acquire()
        if not include_undated:
            self.IDs = pd.read_sql(
                "SELECT albumID\
                                    FROM date_album \
                                    WHERE date_album.date_int>=? AND date_album.date_int<=?",
                self.con,
                params=(self.minim, self.maxim),
            )

        if include_undated:
            self.IDs = pd.read_sql(
                "SELECT ID_album.albumID\
                                    FROM ID_album \
                                    LEFT JOIN date_album ON date_album.albumID=ID_album.albumID \
                                    WHERE date_album.date_int>=? AND date_album.date_int<=? \
                                    OR date_album.date_int ISNULL",
                self.con,
                params=(self.minim, self.maxim),
            )

        self.lock.release()
        self.IDs.columns = ["albumID"]
Example #17
def load_market_cap(symbol_list, query_date = '2015-01-04', source =  "PASS / MARKET DATA", transform_to_weights = False, fix_symbols = False):
    """
    This function modifies symbol_list if this list is not sorted.
    """
    if not isinstance(query_date, str):
        if isinstance(query_date, datetime.date):
            query_date = str(query_date)
        else:
            query_date = str(query_date.date())

    if fix_symbols:
        symbol_list.sort()
        symbol_list = [symbol.replace('=', '/') for symbol in symbol_list]
    db = make_db_connection()
    query_symbols = ' OR '.join(["ST_SECURITY_CODE='%s'" % (symbol) for symbol in symbol_list])
    query_hdpks = "select ST_SECURITY_CODE, HD_PK from PASS_SYS.V_SERIE where (%s) and ST_NAME='%s'" %(query_symbols, source)
    df_hdpks = pd.read_sql(query_hdpks,db, index_col = 'ST_SECURITY_CODE')
    df_hdpks['HD_PK'] = df_hdpks['HD_PK'].apply(lambda key : key.encode('hex'))
    query_lkseries = ' OR '.join(["LK_SERIE=unhex('%s')" % (hdpk) for hdpk in df_hdpks.values.ravel().tolist()])
    query_mkt_cap = "select A.DT_DATE, B.ST_SECURITY_CODE, A.NU_CUR_MKT_CAP from PASS_SYS.V_MKTDATA as A LEFT JOIN PASS_SYS.V_SERIE as B on A.LK_SERIE=B.HD_PK where (%s) and A.DT_DATE<='%s' ORDER BY A.DT_DATE DESC LIMIT %d" % (query_lkseries, query_date, len(symbol_list)*10)
    mkt_caps = pd.read_sql(query_mkt_cap, db, index_col = 'ST_SECURITY_CODE')
    mkt_caps = mkt_caps.groupby(axis = 0, level=0).apply(lambda df: df.bfill()['NU_CUR_MKT_CAP'].values[0])#.apply(lambda df: df.bfill())
    db.close()
    mkt_caps.sort_index(inplace =True)
    if fix_symbols: #unfix them
        mkt_caps.index = symbol_list
    if transform_to_weights:
        mkt_caps = mkt_caps / mkt_caps.sum(skipna=True) 
    return mkt_caps.to_frame('NU_CUR_MKT_CAP')
def test_against_popcycle(tmpdir):
    # Generate popcycle results
    popcycledir = tmpdir.join("popcycle")
    popcycle_cmd = "Rscript tests/generate_popcycle_results.R tests {}".format(str(popcycledir))
    subprocess.check_call(popcycle_cmd.split())

    # Generate seaflowpy results
    dbfile = str(tmpdir.join("testcruise.db"))
    shutil.copyfile("tests/testcruise_paramsonly.db", dbfile)
    os.chmod(dbfile, 0o664)  # make the db writeable
    evt_files = sfp.evt.find_evt_files("tests/testcruise_evt")
    filt_opts = {
        "notch1": None, "notch2": None, "offset": 0.0, "origin": None,
        "width": 0.5
    }
    sfp.filterevt.filter_evt_files(
        evt_files, "testcruise", filt_opts, dbfile, str(tmpdir.join("opp")))
    opp_files = sfp.evt.find_evt_files(str(tmpdir.join("opp")))

    # Compare opp/vct table output
    with sqlite3.connect(dbfile) as con_py:
        opp_py = pd.read_sql("SELECT * FROM opp ORDER BY file", con_py)
    with sqlite3.connect(str(popcycledir.join("testcruise.db"))) as con_R:
        opp_R = pd.read_sql("SELECT * FROM opp ORDER BY file", con_R)

    columns = ["opp_count", "evt_count", "opp_evt_ratio", "notch1", "notch2", "offset", "origin", "width"]
    npt.assert_allclose(opp_py[columns], opp_R[columns])
    assert "\n".join(opp_py["file"].values) == "\n".join(opp_R["file"].values)

    # Compare OPP file output
    opps_py = [sfp.evt.EVT(o) for o in sfp.evt.find_evt_files(str(tmpdir.join("opp")))]
    opps_R = [sfp.evt.EVT(o) for o in sfp.evt.find_evt_files(str(popcycledir.join("opp")))]
    assert len(opps_py) == len(opps_R)
    for i in range(len(opps_py)):
        npt.assert_array_equal(opps_py[i].df, opps_R[i].df)
Example #19
    def get_fix(self):
        """
        Fetch the "fix" DataFrame, merged from the three fix part tables.
        :return:
        :rtype: pd.DataFrame
        """
        self.open()

        part1 = pd.read_sql('select * from %s' % self.table_name_fix_part1,
                            self.connection)
        part2 = pd.read_sql('select * from %s' % self.table_name_fix_part2,
                            self.connection)
        part3 = pd.read_sql('select * from %s' % self.table_name_fix_part3,
                            self.connection)

        res_data = part1.merge(part2, how='left', left_on='index', right_on='index')
        res_data = res_data.merge(part3, how='left', left_on='index', right_on='index')

        self.close()

        # The data format still has a few issues; fix it up below.
        date_list = res_data['date']
        del res_data['date']
        del res_data['index']
        res_data.index = date_list
        return res_data
def main_contents():
    print u'This is a truly wonderful system'
    print u'It frees you from your dull daily routine and guides you to new horizons'

    yomiuri_data = pd.read_sql("SELECT * FROM item ORDER BY DateLine DESC LIMIT 10;", conn)
    
    print u'----------------------------------------------------------'
    print u'Go ahead and pick a news story that interests you\n'

    data = yomiuri_data

    for i in range(5):
        print data['HeadLine']
        print '> ',
        selected = int(sys.stdin.readline())

        print u'----------------------------------------------------------'
        print data.ix[selected, 'HeadLine'], '\n\n'
        print data.ix[selected, 'article'], '\n\n'

        print u'----------------------------------------------------------'
        print u'Things that seem unrelated to this article', '\n'

        no_related_genre = predict_min(yomiuri_data.ix[selected, :])[0]
        print no_related_genre

        data = pd.read_sql(u"SELECT * FROM item WHERE Genre1 = '{}' ORDER BY DateLine DESC LIMIT 10;".format(no_related_genre), conn)

    print u'\n\n_人人人人人人_'
    print u'> Get back to work <'
    print u' ̄Y^Y^Y^Y^Y^Y^ ̄\n\n'
Example #21
def output():
    """
    Calculates the recommended run (difficulty, stress, distance, pace) and renders the results page.
    """

    # reads in pace and activities from options
    mpace_hr = float(request.args.get('mpace'))
    day1 = request.args.get('day1')
    day2 = request.args.get('day2')
    day3 = request.args.get('day3')
    user_pattern = get_pattern([day3,day2,day1])
  
    # loads data from database
    with db:
        cur = db.cursor()

        # list of runners
        good_runners = pd.read_sql("SELECT * FROM good_runners;",db)

        # markov transition probabilities
        prob_table = pd.read_sql("SELECT * FROM act_prob",db)

    # daily metrics for runners
    days = good_runners['days_to_marathon']
    diffs = good_runners['run_difficulty'].astype(int)
    stresses = good_runners['run_stress']
    difficulty = get_difficulty(prob_table,user_pattern)
    day_class = get_class(difficulty)

    with db:
        # get pdf and ints tables for stress and intensity
        # based on difficulty
        if difficulty == 0:
            pdf = [0]
            ints = [0]
        elif difficulty == 1:
            pdf = pd.read_sql("SELECT * FROM easy_pdf",db)['0'].tolist()
            ints = pd.read_sql("SELECT * FROM easy_ints",db)['0'].tolist()
        elif difficulty == 2:
            pdf = pd.read_sql("SELECT * FROM mod_pdf",db)['0'].tolist()
            ints = pd.read_sql("SELECT * FROM mod_ints",db)['0'].tolist()
        elif difficulty == 3:
            pdf = pd.read_sql("SELECT * FROM hard_pdf",db)['0'].tolist()
            ints = pd.read_sql("SELECT * FROM hard_ints",db)['0'].tolist()
        else:
            pdf = pd.read_sql("SELECT * FROM epic_pdf",db)['0'].tolist()
            ints = pd.read_sql("SELECT * FROM epic_ints",db)['0'].tolist() 

    # calculate stress and intensity
    stress = get_stress(pdf)
    intensity = get_intensity(ints)
  
    # information to be displayed for the athlete
    todays_run = get_today(mpace_hr,intensity,stress)
    display = get_display(difficulty,todays_run)
    dist = display[0]
    pace = display[1]

    return render_template("output.html", diff = difficulty,
                     stress = stress, dist = dist, pace = pace, day = day_class)
 def __init__(self):
     engine_statement = "mysql+pymysql://" + self.usr + ":" + self.password + "@" + self.hostname + "/" + self.dbName
     self.engine= sa.create_engine(engine_statement,)
     self.user_behaviours = pd.read_sql("user_behaviours", self.engine).as_matrix()
     self.movies = pd.read_sql("movies", self.engine)
     print("Genre Ctor")
     return
Example #23
def total_added():

	# connect and retrieve information from the databases
	PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
	DATABASE = os.path.join(PROJECT_ROOT, 'data', 'HG.db')
	conn = sqlite3.connect(DATABASE)
	c = conn.cursor()
	data= pd.read_sql('SELECT * FROM stats', conn) 
	data2=pd.read_sql('SELECT * FROM allmedia',conn)
	


	followers_added_per_day = []
	day = []
	length=len(data['Followers'])-1
	i=0
	while i<(1000):
		x = data['Followers'][length-i] - data['Followers'][length-i-48]
		followers_added_per_day.append(x)
		y="{0}-{1}".format(data['Day'][length-i-48], data['Mon'][length-i-48])
		day.append(y)
		i=i+48
	length = len(followers_added_per_day)
	return followers_added_per_day, day, length
	

	followers_past_day = data['Followers'][length] - data['Followers'][length-48]
	followers_past_week = data['Followers'][length] - data['Followers'][length-48*7]
	followers_past_month = data['Followers'][length] - data['Followers'][length-48*30]
	return followers_past_day, followers_past_week, followers_past_month
Example #24
def Followers_per_hour():
	# connect and retrieve information from the databases
	PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
	DATABASE = os.path.join(PROJECT_ROOT, 'data', 'HG.db')
	conn = sqlite3.connect(DATABASE)
	c = conn.cursor()
	data= pd.read_sql('SELECT * FROM stats ORDER BY id ASC', conn) 
	data2=pd.read_sql('SELECT * FROM allmedia',conn)

	#create the column of difference in followers per row
	Followers_per_hour = (data['Followers']-data['Followers'].shift(2)).tolist()
	length = len(Followers_per_hour)

	Followers = data['Followers'].tolist()

	Positive_Followers = len([i for i in Followers_per_hour if i>0])
	Neutral_Followers = len([i for i in Followers_per_hour if i==0])
	Negative_Followers = len([i for i in Followers_per_hour if i<0])
	Positive_vs_Negative = (Positive_Followers, Neutral_Followers, Negative_Followers)
	
	post_occurs = (data['Posts']-data['Posts'].shift(1)).tolist()

	Time_of_Post = (data2['Time_of_Post']).tolist()
	Likes = (data2['Likes']).tolist()
	yaxis=max(Likes)
	length_likes = len(Likes)

	conn.close()
	return Followers_per_hour, length, Followers, Positive_vs_Negative, Time_of_Post, Likes, length_likes, yaxis
Example #25
    def sort(self,sort_by, con, asc=True):
        #sort_by == list of strings with column names
        #asc     == corresponding list of booleans True for ascending sorting (default)

        self.lock.acquire()

        if self.category == 'track':
            df = pd.read_sql('SELECT * FROM view_temp_track', con)

        if self.category == 'album':
            df = pd.read_sql('SELECT * FROM view_temp_album', con)

        self.lock.release()

        df_sorted = df.sort_values(by=sort_by, ascending=asc)

        self.lock.acquire()

        if self.category == 'track':
            # Writing a table with pandas and if_exists='replace' leaves a stray 'level_0'
            # column in the table (it comes from writing the index); passing index=False avoids it.

            df_sorted.to_sql('view_temp_track',con, if_exists='replace', index=False)

        if self.category == 'album':
            df_sorted.to_sql('view_temp_album',con, if_exists='replace', index=False)

        con.commit()
        self.lock.release()
Example #26
 def getGroupNormsWithZerosAsDF(self, groups=[], where='', pivot=False, sparse=False):
     """returns a dict of (group_id => feature => group_norm)"""
     """default index is on group_id and feat"""
     index=['group_id','feat']
     db_eng = mif.get_db_engine(self.corpdb)
     sql = """SELECT group_id, feat, group_norm from %s""" % (self.featureTable)
     if groups:
         gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
         if (where): sql += ' WHERE ' + where + " AND " + gCond
         else: sql += ' WHERE ' + gCond
     elif (where):
         sql += ' WHERE ' + where
     if pivot:
         if sparse:
             return pd.read_sql(sql=sql, con=db_eng, index_col=index).unstack().to_sparse().fillna(value=0)
         else:
             return pd.read_sql(sql=sql, con=db_eng, index_col=index).unstack().fillna(value=0)
     else:
         # this method won't work if default index is changed
         df =  pd.read_sql(sql=sql, con=db_eng, index_col=index)
         idx = pd.MultiIndex.from_product([df.index.levels[0], df.index.levels[1]], names=df.index.names)
         if sparse:
             return df.reindex(idx).to_sparse().fillna(value=0)
         else:
             return df.reindex(idx).fillna(value=0)
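The zero-filling trick above in isolation — a minimal, self-contained sketch with toy data (independent of the feature-table schema):

import pandas as pd

# Toy frame indexed by (group_id, feat); group 2 has no entry for feat 'b'.
df = pd.DataFrame(
    {"group_norm": [0.4, 0.6, 0.9]},
    index=pd.MultiIndex.from_tuples([(1, "a"), (1, "b"), (2, "a")],
                                    names=["group_id", "feat"]))

# Build the full cartesian product of observed groups and feats, then
# reindex so missing combinations show up as explicit zeros.
full = pd.MultiIndex.from_product([df.index.levels[0], df.index.levels[1]],
                                  names=df.index.names)
print(df.reindex(full).fillna(0))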
Example #27
def add_class_tag(df, df_neutral):

	troll_tags = []

	for name in df['name']:
		
		df_count = pd.read_sql("""
			SELECT body, name, author, score FROM May2015
			WHERE name == '{}'
			""".format(name),sql_conn)

		troll_tags.append(1)

	for name in df_neutral['name']:
		
		df_count = pd.read_sql("""
			SELECT body, name, author, score FROM May2015
			WHERE name == '{}'
			""".format(name),sql_conn)

		troll_tags.append(0)


	df_all = df.append(df_neutral, ignore_index=True)
	df_all['Class'] =  troll_tags
	df_all.to_csv('test.csv', encoding='utf-8')
	return df_all		
Example #28
def Followers_per_hour():
	db = DatabaseCreate(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data'), 'HG.db')
	data = pd.read_sql('SELECT * FROM stats ORDER BY id ASC', db.conn) 
	data2 = pd.read_sql('SELECT * FROM allmedia',db.conn)

	# create the column of difference in followers per row
	Followers_per_hour = (data['Followers']-data['Followers'].shift(2)).tolist()
	length = len(Followers_per_hour)

	Followers = data['Followers'].tolist()

	Positive_Followers = len([i for i in Followers_per_hour if i>0])
	Neutral_Followers = len([i for i in Followers_per_hour if i==0])
	Negative_Followers = len([i for i in Followers_per_hour if i<0])
	Positive_vs_Negative = (Positive_Followers, Neutral_Followers, Negative_Followers)
	
	post_occurs = (data['Posts']-data['Posts'].shift(1)).tolist()

	Time_of_Post = (data2['Time_of_Post']).tolist()
	Likes = (data2['Likes']).tolist()
	yaxis = max(Likes)
	length_likes = len(Likes)

	db.conn.close()
	return Followers_per_hour, length, Followers, Positive_vs_Negative, Time_of_Post, Likes, length_likes, yaxis
Example #29
def remove_outliers(measure_type):
    value_lb_dict = {'electric': 'Electric_(KWH)', 'gas':
                     'Gas_(CubicFeet)'}
    col = value_lb_dict[measure_type]
    conn = uo.connect('interval_ion')
    dfs = []
    with conn:
        df_bs = pd.read_sql('SELECT * FROM {0}_id_station'.format(measure_type), conn)
    bs_pair = zip(df_bs['Building_Number'], df_bs['ICAO'])
    # bs_pair = [x for x in bs_pair if x[0] == 'AL0039AB']
    for i, (b, s) in enumerate(bs_pair):
        print i, b
        with conn:
            df = pd.read_sql('SELECT * FROM {0} WHERE Building_Number = \'{1}\''.format(measure_type, b), conn)
        # df = df.head(n = 5000)
        # df.info()
        points = df[col]
        outliers = show_outlier(points, b, 'upper', measure_type, 1.5)
        # outliers = show_outlier(points, b, 'pos_roll', measure_type, 1.5)
        # mild, outliers = show_outlier(points, b, 'box', measure_type, 1.5)
        df['outlier'] = outliers
        print len([x for x in outliers if x])
        dfs.append(df)
    df_all = pd.concat(dfs, ignore_index=True)
    print df_all.head()
    with conn:
        df_all.to_sql('{0}_outlier_tag'.format(measure_type),
                      conn, if_exists='replace')
    return
	def get_venues(self, top_venues, num=20, exclude_recent=False, sk_artist_id=None):
		"""top_venues is a dict where the key is venue id and the value is a score. 
		num is the number of venues to return
		returns venue metadata and stores results in the venues table if they don't already exist"""
		top_venues_df = pd.DataFrame.from_dict(top_venues).reset_index()
		top_venues_df.columns = ["id","score"]
		if exclude_recent and sk_artist_id is not None:
			recent_venues = self.get_recent_venues(sk_artist_id)
			top_venues_df = top_venues_df.loc[ ~top_venues_df['id'].isin(recent_venues['id']) ,:].reset_index(drop=True)
		con = sql3.connect(db_path)
		venue_ids = []
		count=1
		venues = pd.read_sql("select * from venues",con)
		for venue_id in top_venues_df['id']:
			if venue_id not in venues['id'].tolist() and not offline_mode:
				self.store_venue(venue_id)  # store the venue data
				venues = pd.read_sql("select * from venues",con)  # reload the venues dataframe
			if venues.loc[venues['id']==venue_id,'country'].tolist()[0]=="US":
				venue_ids.append(venue_id)
				count+=1
				if count>num:
					break
			
		query = "select id, name, city, state, lat, lng, capacity from venues where country='US' and id in ({})".format(",".join(map(str,venue_ids)))
		venues = pd.read_sql(query,con)
		return venues.merge(top_venues_df).sort("score",ascending=False)
Example #31
def stat_index_all_no_use(tmp_datetime):
    datetime_str = (tmp_datetime).strftime("%Y-%m-%d")
    datetime_int = (tmp_datetime).strftime("%Y%m%d")
    print("datetime_str:", datetime_str)
    print("datetime_int:", datetime_int)

    # Select today's qualifying stocks, excluding ChiNext (300xxx) and SME-board (002xxx)
    # stocks as well as all ST stocks:
    # `code` not like '002%' and `code` not like '300%' and `name` not like '%st%'
    sql_1 = """ 
            SELECT `date`, `code`, `name`, `changepercent`, `trade`, `open`, `high`, `low`, 
                `settlement`, `volume`, `turnoverratio`, `amount`, `per`, `pb`, `mktcap`, `nmc` 
            FROM stock_data.ts_today_all WHERE `date` = %s and `trade` > 0 and `open` > 0 and trade <= 20 
                and `code` not like %s and `code` not like %s and `name` not like %s
            """
    print(sql_1)
    global db
    data = pd.read_sql(sql=sql_1,
                       con=db.engine,
                       params=[datetime_int, '002%', '300%', '%st%'])
    data = data.drop_duplicates(subset="code", keep="last")
    print("########data[trade]########:", len(data))
    # print(data["trade"])

    # 1) n-day percentage change.
    # Price change (in percent) between today and the day n days before; 'r' stands for rate,
    # e.g. stock['close_-2_r'] compares today with the day before yesterday.
    # (The stockstats-style sketch after this function shows how such columns are derived.)
    stock_column = ['close_-1_r', 'close_-2_r', 'code',
                    'date']  # close_-1_r  close_-2_r    code      date
    data_new = concat_guess_data(stock_column, data)

    # 2) CR indicator (a price-momentum indicator).
    # http://wiki.mbalib.com/wiki/CR%E6%8C%87%E6%A0%87
    # When CR falls through its a/b/c/d lines and then climbs back up to 160 from the low,
    # it is a good point to take short-term profits; CR below 40 is a good time to open a
    # position, while 300-400 suggests trimming the position.
    stock_column = ['code', 'cr', 'cr-ma1', 'cr-ma2', 'cr-ma3',
                    'date']  # code     cr cr-ma1 cr-ma2 cr-ma3      date
    data_new = concat_guess_data(stock_column, data_new)

    # 3) KDJ (stochastic) indicator.
    # http://wiki.mbalib.com/wiki/%E9%9A%8F%E6%9C%BA%E6%8C%87%E6%A0%87
    # KDJ uses the highest price, lowest price and closing price over a fixed window
    # (typically 9 days or 9 weeks) to compute the raw stochastic value (RSV), then smooths
    # it into the K, D and J lines. J = 3K - 2D measures how far K diverges from D and leads
    # KD in spotting tops and bottoms; J above 100 signals overbought, below 10 oversold.
    stock_column = ['code', 'date', 'kdjd', 'kdjj',
                    'kdjk']  # code      date   kdjd   kdjj   kdjk
    data_new = concat_guess_data(stock_column, data_new)

    # 4) MACD (Moving Average Convergence Divergence).
    # http://wiki.mbalib.com/wiki/MACD
    # MACD works well in trending markets but gives unclear signals when prices move sideways;
    # it is often combined with a short-term indicator such as KD to double-confirm signals.
    stock_column = ['code', 'date', 'macd', 'macdh',
                    'macds']  # code      date   macd  macdh  macds
    data_new = concat_guess_data(stock_column, data_new)

    # 5) BOLL indicator (Bollinger Bands).
    # http://wiki.mbalib.com/wiki/BOLL
    stock_column = ['boll', 'boll_lb', 'boll_ub', 'code',
                    'date']  # boll boll_lb boll_ub    code      date
    data_new = concat_guess_data(stock_column, data_new)

    # 6) RSI (Relative Strength Index).
    # http://wiki.mbalib.com/wiki/RSI
    # RSI above 50 indicates a strong market, below 50 a weak one; it mostly fluctuates
    # between 30 and 70. A 6-day RSI reaching 80 signals overbought, and above 90 a severely
    # overbought warning zone where a top is likely forming and a short-term reversal is possible.
    stock_column = ['code', 'date', 'rsi_12',
                    'rsi_6']  # code      date rsi_12  rsi_6
    data_new = concat_guess_data(stock_column, data_new)

    # 7) W%R (Williams %R).
    # http://wiki.mbalib.com/wiki/%E5%A8%81%E5%BB%89%E6%8C%87%E6%A0%87
    # An oscillator used to measure overbought/oversold conditions.
    stock_column = ['code', 'date', 'wr_10',
                    'wr_6']  # code      date  wr_10   wr_6
    data_new = concat_guess_data(stock_column, data_new)

    # 8) CCI (Commodity Channel Index).
    # http://wiki.mbalib.com/wiki/%E9%A1%BA%E5%8A%BF%E6%8C%87%E6%A0%87
    # Created by Donald Lambert, CCI focuses on how far price deviates from its normal range.
    # 1. When CCI crosses above +100 into the abnormal zone, price has entered an unusual
    #    up-swing: a short-term buy signal, more reliable with heavy volume.
    # 2. When CCI crosses below -100, the consolidation phase is over and a long bottoming
    #    process is likely to begin, so holding cash and waiting is preferred.
    # CCI, default to 14 days
    stock_column = ['cci', 'cci_20', 'code', 'date']  # cci cci_20 code date
    data_new = concat_guess_data(stock_column, data_new)

    # 9) TR / ATR (Average True Range).
    # http://wiki.mbalib.com/wiki/%E5%9D%87%E5%B9%85%E6%8C%87%E6%A0%87
    # ATR is a moving average of the true price range over a period,
    # mainly used to judge buy/sell timing.
    stock_column = ['cci', 'cci_20', 'code', 'date']  # cci cci_20 code date
    data_new = concat_guess_data(stock_column, data_new)

    # 10) DMA (Different of Moving Average), also called the parallel-line difference
    # indicator; a common medium/short-term indicator for index and individual-stock analysis.
    # http://wiki.mbalib.com/wiki/DMA
    # DMA, difference of 10 and 50 moving average
    # stock['dma']
    stock_column = ['code', 'date', 'dma']  # code    date       dma
    data_new = concat_guess_data(stock_column, data_new)

    # 11) DMI, +DI, -DI, DX, ADX, ADXR.
    # http://wiki.mbalib.com/wiki/DMI  (Directional Movement Index, DMI)
    # http://wiki.mbalib.com/wiki/ADX  (Average Directional Indicator, ADX)
    # http://wiki.mbalib.com/wiki/%E5%B9%B3%E5%9D%87%E6%96%B9%E5%90%91%E6%8C%87%E6%95%B0%E8%AF%84%E4%BC%B0
    # ADXR is the average of today's ADX and the ADX of some earlier day; when ADXR falls
    # together with ADX from a high level it helps confirm that ADX has turned. ADXR is only
    # a supporting signal, not an entry signal on its own, and should be combined with the
    # directional indicator (DMI); in practice ADX is primary and ADXR secondary.
    stock_column = ['adx', 'adxr', 'code', 'date', 'dx', 'mdi',
                    'pdi']  # adx   adxr    code      date     dx    mdi    pdi
    data_new = concat_guess_data(stock_column, data_new)

    # 12) TRIX / MATRIX (Triple Exponentially Smoothed Average).
    # http://wiki.mbalib.com/wiki/TRIX
    stock_column = ['code', 'date', 'trix',
                    'trix_9_sma']  # code      date    trix trix_9_sma
    data_new = concat_guess_data(stock_column, data_new)

    # 13) VR / MAVR (Volume Ratio).
    # http://wiki.mbalib.com/wiki/%E6%88%90%E4%BA%A4%E9%87%8F%E6%AF%94%E7%8E%87
    # VR compares turnover on up days with turnover on down days to gauge
    # medium-term buying versus selling pressure.
    stock_column = ['code', 'date', 'vr',
                    'vr_6_sma']  # code      date          vr    vr_6_sma
    data_new = concat_guess_data(stock_column, data_new)

    data_new = data_new.round(2)  # keep two decimal places

    # Delete old data for this trade date.
    del_sql = "DELETE FROM guess_indicators_daily WHERE trade_date= %(trade_date)s "
    db.execute(del_sql, params={'trade_date': datetime_int})

    # print(data_new.head())
    # data_new["down_rate"] = (data_new["trade"] - data_new["wave_mean"]) / data_new["wave_base"]
    db.insert_db(data_new, "guess_indicators_daily", True, "`date`,`code`")
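concat_guess_data is not shown above; the columns it adds ('cr', 'kdjk', 'macd', 'boll', 'rsi_6', ...) follow the stockstats naming scheme, so here is a minimal, hypothetical sketch of how such columns can be produced (an assumption about the helper, not its actual implementation):

import pandas as pd
from stockstats import StockDataFrame  # assumed dependency

def guess_indicator_columns(history, columns):
    # history: per-stock OHLCV DataFrame (needs open/high/low/close/volume plus code/date);
    # columns: e.g. ['code', 'date', 'kdjd', 'kdjj', 'kdjk'].
    sdf = StockDataFrame.retype(history.copy())
    for col in columns:
        if col not in ("code", "date"):
            sdf[col]  # stockstats computes an indicator column the first time it is accessed
    return pd.DataFrame(sdf)[list(columns)]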
Example #32
def user_elite_cleaned_csv():
    """Prepare data from database and save the queried table as a csv file. This version
    only include reviews / tips years between 2010-2016. Elite users before 2010 or
    after 2016 are excluded.
    """
    with sqlite3.connect(DB_PATH) as conn:
        c = conn.cursor()
        # pre-condition
        c.executescript("""
        -- create user friend summary table
        CREATE TEMP TABLE _uf AS
        SELECT user_id, COUNT(*) as friends
        FROM user_friends
        GROUP BY user_id;

        -- create user tip summary table
        CREATE TEMP TABLE _t AS
        SELECT user_id, SUM(compliment_count) AS tip_compliment, COUNT(*) AS tips, AVG(length(text)) AS tip_len
        FROM tip
        WHERE STRFTIME('%Y', date) BETWEEN '2010' AND '2016'
        GROUP BY user_id;

        -- create user review summary table
        CREATE TEMP TABLE _r AS
        SELECT user_id, COUNT(*) as review_num, AVG(length(text)) as review_len
        FROM review
        WHERE STRFTIME('%Y', date) BETWEEN '2010' AND '2016'
        GROUP BY user_id;

        -- add users having 0 friends
        INSERT INTO _uf
        SELECT user_id, 0
        FROM users
        WHERE user_id NOT IN (
            SELECT user_id
            FROM _uf
        );

        -- add users having 0 reviews
        INSERT INTO _r
        SELECT user_id, 0, 0.
        FROM users
        WHERE user_id NOT IN (
            SELECT user_id
            FROM _r
        );

        -- add users having 0 tips
        INSERT INTO _t
        SELECT user_id, 0, 0, 0.
        FROM users
        WHERE user_id NOT IN (
            SELECT user_id
            FROM _t
        );
        
        -- create exclusive user table
        CREATE TEMP TABLE _eu AS
        SELECT user_id
        FROM user_elite
        WHERE elite BETWEEN '2010' AND '2016'
        GROUP BY user_id;
        """)

        # noinspection SqlResolve
        df = pd.read_sql(
            """
        SELECT u.user_id,
            review_count,
            useful,
            cool,
            funny,
            fans,
            (compliment_hot + compliment_more + compliment_profile + compliment_cute + compliment_list + compliment_note 
            + compliment_plain + compliment_cool + compliment_funny + compliment_writer + compliment_photos) 
            AS compliment,
            friends,
            tip_compliment,
            tips,
            tip_len,
            review_num,
            review_len,
            u.user_id IN (SELECT user_id FROM user_elite) AS elite
        FROM users u,
             _uf,
             _t,
             _r
        WHERE u.user_id = _uf.user_id
        AND   u.user_id = _t.user_id
        AND   u.user_id = _r.user_id
        AND   u.user_id NOT IN _eu
        """, conn)
        df.to_csv(DATA_DIR / 'user-profiling.csv', index=False)

        # post-condition
        # noinspection SqlResolve
        c.executescript("""
        DROP TABLE _t;
        DROP TABLE _uf;
        DROP TABLE _r;
        DROP TABLE _eu;
        """)
#encoding=utf-8
import pandas as pd
import MySQLdb
from sqlalchemy import create_engine
from sqlalchemy.types import String,BLOB
mysql_cn = create_engine('mysql+mysqldb://root:bmg123@localhost:3306/bbs_pro?charset=utf8')
#mysql_cn=MySQLdb.connect(host="localhost",port=3306,user='******',passwd='bmg123',db='bbs_pro',charset='utf8')
df=pd.read_sql("select * from app01_bbs",con=mysql_cn)
#mysql_cn.close()
#mysql_cn=MySQLdb.connect(host="localhost",port=3306,user='******',passwd='bmg123',db='bbs_pro',charset='utf8')
pd.io.sql.to_sql(df,'app01_bbs_1',con=mysql_cn,if_exists='append',index=False)

Example #34
import pandas as pd
import matplotlib.pyplot as plt
import parameter as pa
from sklearn import cluster
from sklearn.metrics import adjusted_rand_score
from sklearn.mixture import GaussianMixture
from sqlalchemy import create_engine
import pymysql
import sys
reload(sys)
sys.setdefaultencoding('utf8')

engine = create_engine(
    'mysql+pymysql://root:[email protected]/demo?charset=utf8')

df = pd.read_sql('select * from scope_jianmo', engine)

l2 = [
    '批发业', '纺织业', '橡胶和塑料制品业', '商务服务业', '纺织服装、服饰业', '印刷和记录媒介复制业', '科技推广和应用服务业',
    '零售业', '专业技术服务业', '通用设备制造业', '其他制造业', '其他金融业', '非金属矿物制品业', '金属制品业',
    '皮革、毛皮、羽毛及其制品和制鞋业', '研究和试验发展', '农、林、牧、渔服务业', '居民服务业', '造纸和纸制品业', '新闻和出版业',
    '化学原料和化学制品制造业', '仪器仪表制造业', '软件和信息技术服务业', '酒、饮料和精制茶制造业', '货币金融服务', '仓储业',
    '建筑装饰和其他建筑业', '机动车、电子产品和日用产品修理业', '化学纤维制造业', '文教、工美、体育和娱乐用品制造业',
    '装卸搬运和运输代理业', '土木工程建筑业', '道路运输业', '房地产业', '食品制造业', '专用设备制造业', '电气机械和器材制造业',
    '其他服务业', '废弃资源综合利用业', '互联网和相关服务', '金属制品、机械和设备修理业', '有色金属冶炼和压延加工业', '农业',
    '住宿业', '资本市场服务', '汽车制造业', '文化艺术业', '电信、广播电视和卫星传输服务', '医药制造业', '家具制造业',
    '铁路、船舶、航空航天和其他运输设备制造业', '娱乐业', '租赁业', '体育', '木材加工和木、竹、藤、棕、草制品业', '保险业',
    '教育', '煤炭开采和洗选业', '烟草制品业', '计算机、通信和其他电子设备制造业', '非金属矿采选业',
    '广播、电视、电影和影视录音制作业', '房屋建筑业', '黑色金属冶炼和压延加工业', '水上运输业', '邮政业', '农副食品加工业',
    '建筑安装业', '生态保护和环境治理业', '餐饮业', '卫生', '黑色金属矿采选业', '铁路运输业', '电力、热力生产和供应业',
    '畜牧业', '林业', '水的生产和供应业', '公共设施管理业', '航空运输业', '渔业', '石油加工、炼焦和核燃料加工业',
def load_data():
    conn = sqlite3.connect('../data/db.sqlite')
    df = pd.read_sql('select * from disaster', conn)
    return df
Example #36
def load_data_from_source():
    global tripWindow_start_time, tripWindow_end_time
    global fromLaguardiaPoolsCreatedCount
    global toLaguardiaPoolsCreatedCount
    global fromLaguardiaPoolsProcessedCount
    global toLaguardiaPoolsProcesedCount

    # print("TRIPS WINDOW " + tripWindow_start_time + " " + tripWindow_end_time)
    # Get the 1st starting trip record whose pickup time is in between trip window start time and trip window end time
    trip_records_query = "select RideID, tpep_pickup_datetime ,pickup_latitude," + "pickup_longitude, dropoff_latitude," \
                                                                                   "dropoff_longitude,dist_airport from taxitrips_v2 " \
                                                                                   "where tpep_pickup_datetime " \
                                                                                   "between \"" \
                         + tripWindow_start_time + "\" and \"" + tripWindow_end_time + "\" ORDER BY tpep_pickup_datetime ASC "
    # print(trip_records_query)
    df_mysql = read_sql(trip_records_query, con=connection)

    if len(df_mysql) == 0:
        logging.info("No records exist between the given dates " +
                     tripWindow_start_time + " " + tripWindow_end_time)
        # print("No records exist between the given dates " + tripWindow_start_time + " " + tripWindow_end_time)
    else:
        # Put it all to a data frame
        tripData = df_mysql

        # Get 1st records's start date
        pool_start_date = tripData.iloc[0]['tpep_pickup_datetime']

        # Set pool end date based on pool start date and pool window
        pool_end_date = pool_start_date + timedelta(minutes=pool_window_time1)

        # print("Started Analyzing trip requests for pool windows of " + str(pool_window_time1) + " minutes")
        tripWindow_end_time = datetime.strptime(tripWindow_end_time,
                                                "%Y-%m-%d %H:%M:%S")

        while pool_end_date <= tripWindow_end_time:
            FromLaguardiaRecords = tripData.loc[
                (tripData['pickup_longitude'].between(source_longitude_min,
                                                      source_longitude_max))
                & (tripData['pickup_latitude'].between(source_latitude_min,
                                                       source_latitude_max))
                & (tripData['tpep_pickup_datetime']).between(
                    pool_start_date, pool_end_date)]
            ToLaguardiaRecords = tripData.loc[
                (tripData['dropoff_longitude'].between(source_longitude_min,
                                                       source_longitude_max))
                & (tripData['dropoff_latitude'].between(
                    source_latitude_min, source_latitude_max)) &
                (tripData['tpep_pickup_datetime']).between(
                    pool_start_date, pool_end_date)]

            # print("len is " + str(len(FromLaguardiaRecords)) + " " + str(len(ToLaguardiaRecords)))

            pick_a_ride(FromLaguardiaRecords, "From Laguardia",
                        pool_window_time1)
            pick_a_ride(ToLaguardiaRecords, "To Laguardia", pool_window_time1)

            pool_start_date = pool_end_date + timedelta(seconds=1)
            pool_end_date = pool_end_date + timedelta(
                minutes=pool_window_time1)

        if pool_start_date < tripWindow_end_time and pool_end_date > tripWindow_end_time:
            FromLaguardiaRecords = tripData.loc[
                (tripData['pickup_longitude'].between(source_longitude_min,
                                                      source_longitude_max))
                & (tripData['pickup_latitude'].between(source_latitude_min,
                                                       source_latitude_max))
                & (tripData['tpep_pickup_datetime']).between(
                    pool_start_date, tripWindow_end_time)]
            ToLaguardiaRecords = tripData.loc[
                (tripData['dropoff_longitude'].between(source_longitude_min,
                                                       source_longitude_max))
                & (tripData['dropoff_latitude'].between(
                    source_latitude_min, source_latitude_max)) &
                (tripData['tpep_pickup_datetime']).between(
                    pool_start_date, tripWindow_end_time)]
            print("len is " + str(len(FromLaguardiaRecords)) + " " +
                  str(len(ToLaguardiaRecords)))

            pick_a_ride(FromLaguardiaRecords, "From Laguardia",
                        pool_window_time1)
            pick_a_ride(ToLaguardiaRecords, "To Laguardia", pool_window_time1)

        pool_start_date = tripData.iloc[0]['tpep_pickup_datetime']
        logging.info("Starting processing for 10 minutes window")
        # Set pool end date based on pool start date and pool window
        pool_end_date = pool_start_date + timedelta(minutes=pool_window_time2)
    def get(self, request: HttpRequest, val_pks: List[int], fmt_pks: Optional[List[int]] = None,
            *args, **kwargs) -> Response:
        if fmt_pks is None:
            fmt_pks = []
        do_excel = False
        if 'report' in request.GET and request.GET['report'] == 'excel':
            do_excel = True
        grouping = request.GET.get('group-by', 'feature')
        validations = Validation.objects.filter(pk__in=val_pks)

        # Looking for best items in target validations
        ibest = Result.sa \
            .query(Result.sa.item_id, func.max(Status.sa.priority).label('best_status_priority')) \
            .filter(Result.sa.validation_id.in_(val_pks),) \
            .join(Status.sa) \
            .group_by(Result.sa.item_id).subquery('ibest')

        # looking for date of best validation
        best = Result.sa.query(ibest.c.item_id, func.max(Status.sa.id).label('best_status'),
                               func.max(Validation.sa.date).label('best_validation_date')) \
            .select_from(Result.sa) \
            .join(Status.sa, Status.sa.id == Result.sa.status_id)\
            .join(Validation.sa, Validation.sa.id == Result.sa.validation_id) \
            .join(ibest, Result.sa.item_id == ibest.c.item_id) \
            .filter(
                Result.sa.validation_id.in_(val_pks),
                Status.sa.priority == ibest.c.best_status_priority) \
            .group_by(ibest.c.item_id).subquery('best')

        v2 = Result.sa.query(
                Result.sa.item_id,
                Validation.sa.id,
                Result.sa.status_id, Validation.sa.date
            ) \
            .filter(Result.sa.validation_id.in_(val_pks),) \
            .join(Validation.sa) \
            .subquery('v2')

        # Looking for best validation in found date
        vbest = Result.sa.query(best.c.item_id, func.max(v2.c.id).label('best_validation'),
                                func.max(best.c.best_status).label('best_status_id')) \
            .select_from(best) \
            .join(v2, and_(v2.c.item_id == best.c.item_id, v2.c.status_id == best.c.best_status,
                           v2.c.date == best.c.best_validation_date)) \
            .group_by(best.c.item_id) \
            .subquery('vbest')

        # Looking for best results in found validations
        res = Result.sa.query(vbest.c.item_id, vbest.c.best_validation, vbest.c.best_status_id,
                              func.max(Result.sa.id).label('result')) \
            .select_from(vbest) \
            .join(Result.sa,
                  and_(
                      Result.sa.item_id == vbest.c.item_id,
                      Result.sa.validation_id == vbest.c.best_validation,
                      Result.sa.status_id == vbest.c.best_status_id
                    )
                ) \
            .group_by(vbest.c.item_id, vbest.c.best_validation, vbest.c.best_status_id)

        # Select scenario ids, feature names, codec names from
        # feature mapping rules which belong to selected FMTs
        fm_rules = fmt_rules(fmt_pks).subquery('fm_rules')

        # joining referenced tables to get names and so on
        res = res.subquery('res')
        q = Result.sa.query(
            (fm_rules.c.Feature if grouping == 'feature' else fm_rules.c.Codec).label('group'),
            Item.sa.name.label('item_name'), fm_rules.c.Codec,
            res.c.best_status_id, Validation.sa.name.label('val_name'),
            Driver.sa.name.label('driver_name'), Result.sa.item_id, Result.sa.validation_id,
            Validation.sa.source_file, Result.sa.result_url) \
                .select_from(res) \
                .join(Item.sa, Item.sa.id == res.c.item_id) \
                .join(Result.sa, Result.sa.id == res.c.result) \
                .join(fm_rules, Item.sa.scenario_id == fm_rules.c.scenario_id, full=True) \
                .join(Validation.sa) \
                .join(Driver.sa) \
                .order_by(fm_rules.c.Feature if grouping == 'feature' else fm_rules.c.Codec,
                        Item.sa.name)

        # Create DataFrame crosstab from SQL request
        df = pd.read_sql(q.statement, q.session.bind)
        df['group'] = df['group'].fillna('Unknown')
        if grouping == 'feature' and fmt_pks:
            # extend feature name with codec
            df = df.apply(lambda row: feature_codec_concat(row), axis=1)
        df = df.drop('Codec', axis=1)  # drop returns a new frame; assign it back

        if df.empty:
            if do_excel:
                return Response()
            return Response({'headers': [], 'items': []})

        ct = pd.crosstab(index=df.group,
                         values=df.item_name, columns=df.best_status_id, aggfunc='count',
                         colnames=[''],
                         margins=True, margins_name='Total', dropna=False)

        # prepare DataFrame crosstab for response
        ct = prepare_crosstab(ct, grouping)
        # If no excel report needed just finish here with json return
        if not do_excel:
            return Response(convert_to_datatable_json(ct))

        # Excel part
        workbook = excel.do_report(data=ct, extra=validations, report_name='Best status report')

        filename = f'best_report_{datetime.now():%Y-%m-%d_%H:%M:%S}.xlsx'
        response = HttpResponse(save_virtual_workbook(workbook), content_type='application/ms-excel')
        response['Content-Disposition'] = f'attachment; filename="{filename}"'
        return response
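# A minimal, self-contained sketch (toy data, not the project's models) of the
# pd.crosstab step used above: counting items per group/status with a 'Total' margin.
import pandas as pd

toy = pd.DataFrame({
    'group': ['decode', 'decode', 'encode'],
    'item_name': ['case_a', 'case_b', 'case_c'],
    'best_status_id': [1, 2, 1],
})
toy_ct = pd.crosstab(index=toy.group, columns=toy.best_status_id,
                     values=toy.item_name, aggfunc='count',
                     colnames=[''], margins=True, margins_name='Total',
                     dropna=False)
# toy_ct has one row per group plus a 'Total' row, and one column per status id.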
Example #38
0
def main(args=None):
    args = parser.parse_args(args)
    config.read(args.config)

    try:
        logging.basicConfig(
            format="%(levelname)s:%(message)s", level=config["logging"]["log-level"]
        )
    except ValueError:
        logging.warning(
            f"Incorrect log-level specified: {config['logging']['log-level']}. "
            "Falling back to INFO."
        )
        logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)

    try:
        realm = Realm[config["api"]["realm"]]
    except KeyError:
        logging.critical(
            f"Configured realm \"{config['api']['realm']}\" is unknown. "
            f"Choose one of: {', '.join(Realm.__members__.keys())}"
        )
        sys.exit(1)

    if not len(config["accounts"]):
        logging.warning(
            "There are no configured accounts, nothing to do. "
            'Check the "accounts" section in the config file.'
        )

    try:
        account_data = account_info(
            realm,
            config["api"]["application-id"],
            list(config["accounts"].keys()),
        ).values()
    except ValueError as e:
        logging.critical(e)
        sys.exit(1)

    flat_account_data = [
        timestamps_to_datetime(flatten(data, strip=True), keys=TIME_FIELDS)
        for data in account_data
    ]

    rows = [
        {column.name: data[column.name] for column in statistics.columns}
        for data in flat_account_data
    ]

    try:
        with sa.create_engine(config["db"]["url"]).connect() as conn:
            changed = False
            for row in rows:
                logging.info(
                    f"Attempting insert {row['nickname']} @ {row['updated_at']}"
                )
                try:
                    conn.execute(
                        statistics.insert()
                        .values(row)
                        .compile(dialect=postgresql.dialect())
                    )
                    changed = True
                    logging.info("Insert successful")
                except IntegrityError as e:
                    if not isinstance(e.orig, psycopg2.errors.UniqueViolation):
                        raise
                    logging.info("Skipping, record exists")

            if config["plots"] and changed:
                logging.info("Change detected, updating plots")
                df = pd.read_sql(
                    "SELECT * from statistics ORDER BY updated_at",
                    conn,
                    index_col=["account_id", "updated_at"],
                )
                plt.style.use("Solarize_Light2")
                for path, interval_str in config["plots"].items():
                    if interval_str:
                        figure = create_plot(
                            df[
                                df.index.get_level_values(1)
                                > (
                                    pd.Timestamp.now(tz="UTC")
                                    - pd.Timedelta(interval_str)
                                )
                            ]
                        )
                    else:
                        figure = create_plot(df)
                    logging.info(f"Saving {path}")
                    figure.savefig(path)

    except sa.exc.OperationalError as e:
        logging.critical(f"Invalid database URL: {config['db']['url']} ({e})")
        sys.exit(1)
    except sa.exc.NoSuchModuleError:
        logging.critical(f"Invalid protocol in database URL: {config['db']['url']}")
        sys.exit(1)
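# Self-contained sketch of the plot-interval filter above: keep only rows whose
# second index level (updated_at) falls inside a pd.Timedelta window of "now".
# The toy frame and the 90-day window are assumptions for illustration only.
import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [(1, pd.Timestamp("2020-01-01", tz="UTC")),
     (1, pd.Timestamp.now(tz="UTC") - pd.Timedelta("1 day"))],
    names=["account_id", "updated_at"],
)
toy = pd.DataFrame({"battles": [10, 42]}, index=idx)
recent = toy[
    toy.index.get_level_values(1) > (pd.Timestamp.now(tz="UTC") - pd.Timedelta("90 days"))
]
# recent keeps only the row updated within the last 90 days.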
Example #39
0
def get_columns(table, db):
    return pd.read_sql(f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{db[0]}' AND TABLE_NAME = '{table[0]}';", conn).values
Example #40
0
        tilt_1 = check(start, '傾角1管理值判定')  # tilt angle 1 control-value check
        tilt_2 = check(start, '傾角2管理值判定')  # tilt angle 2 control-value check
        send(mailgo, water)
        send(mailgo, tilt_1)
        send(mailgo, tilt_2)

        time.sleep(n)


SYS_path = os.path.dirname(os.path.abspath(__file__))
STATUS_TABLE_Path = SYS_path + '/STATUS_TABLE_OUTPUT/' + 'Monitoring_Status.xlsx'

USERLIST_DB = Path(SYS_path + "/USERLIST.mdb")
conn_str = (r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};''DBQ=%s;' %(USERLIST_DB))
cnxn = pyodbc.connect(conn_str)
df = pd.read_sql("SELECT * FROM USERLIST", cnxn)
df_NOTIFICATION = df[df['資料通知'] == '開啟']
toaddr = list(df_NOTIFICATION['帳號'])
mailgo = ",".join(toaddr)





timer_reload(259200)
Example #41
0
def get_databases():
    databases = pd.read_sql("SELECT schema_name FROM information_schema.schemata WHERE schema_name not in ('information_schema','mysql','performance_schema','sys');", conn).values
    return databases
Example #42
0
def get_tables(db):
    return pd.read_sql(f"SHOW TABLES FROM {db[0]};", conn).values
#####################
# Import libraries
import pandas as pd 
import numpy as np
#import plotly.graph_objects as go
from datetime import datetime, timedelta
import time
from sqlalchemy import create_engine

#####################
### Fetch raw data from AWS DB

engine = create_engine('postgresql://*****:*****@ds4a18.cmlpaj0d1yqv.us-east-2.rds.amazonaws.com:5432/Airports_ds4a')
var = pd.read_sql("SELECT count(1) from dataraw", engine.connect(), parse_dates=('valid',))
df = pd.read_sql("SELECT * from dataraw", engine.connect(), parse_dates=('valid',))
df = df.dropna(subset=['valid'],axis=0)
df['DateTime'] = df['valid'].apply(lambda x: datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S")- timedelta(hours=3))
df.replace(['M',None],np.nan,inplace=True)

print('Data successfully fetched from AWS RDS', end='\n\n')


#####################
#### Select columns with NA % lower than 22% (previously selected)

print('Deleting variables with more than 22% of missing values', end='\n\n')

cols = ['id','station','DateTime', 'valid', 'tmpf', 'dwpf', 'relh', 'drct',
       'sknt', 'p01i', 'alti', 'vsby', 'skyc1', 'skyl1', 'feel']
df = df[cols]
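# A minimal sketch (an assumption about how the 22% threshold mentioned above
# could be applied programmatically instead of hard-coding `cols`):
na_pct = df.isna().mean() * 100            # percent of missing values per column
low_na_cols = na_pct[na_pct < 22].index.tolist()
df_low_na = df[low_na_cols]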
Example #44
0
# Connect to database:
con = sqlite3.connect("NYC-Taxi.db")

# ## Extract observations and clean data
#
# The analysis will focus on a sample of a specified number of observations of green and yellow cab data. Data from each of these sources will be stacked together, features created, and then finally a train and test set created (a sketch of the stack-and-split step appears at the end of this snippet).

# In[2]:

# need to set seed by randomly sampling indices in python then pass to sql

# specify number of rows with pull variable
pull = "5000000"
# Store sample of green and yellow cab data into dataframes:
df1 = pd.read_sql(f"SELECT * FROM green_cabs ORDER BY random() LIMIT" + " " +
                  pull,
                  con=con)
df2 = pd.read_sql(f"SELECT * from yellow_cabs ORDER BY random() LIMIT" + " " +
                  pull,
                  con=con)

# Add labels for green and yellow cabs and rename pickup/dropoff datetime columns:
df1 = df1.rename(
    columns={
        "lpep_pickup_datetime": "pickup_datetime",
        "lpep_dropoff_datetime": "dropoff_datetime"
    })
df2 = df2.rename(
    columns={
        "tpep_pickup_datetime": "pickup_datetime",
        "tpep_dropoff_datetime": "dropoff_datetime"
    })
Example #45
0
            sDate = str(j.strftime("%Y-%m-%d"))
            eDate = str((j + dt.timedelta(days=6)).strftime("%Y-%m-%d"))

            counter += 1
            print(f"{client_name} - {counter} of {num_weeks}")
            print(f'Aggregating {sDate} - {eDate}')
            sql = f"""
                SELECT RanksDaily_{save_name}.Date, KeywordsTable.StatId, RanksDaily_{save_name}.Rank as Rank, RanksDaily_{save_name}.BaseRank as BaseRank, RankingUrl.Url as RankingUrl
                FROM RanksDaily_{save_name}
                JOIN KeywordsTable ON RanksDaily_{save_name}.KeywordId = KeywordsTable.Id
                JOIN RankingUrl ON RanksDaily_{save_name}.RankingUrlId = RankingUrl.Id
                WHERE date BETWEEN :sDate AND :eDate;
                """
            params = {"sDate":sDate, "eDate":eDate}
            df = pd.read_sql(sql, params=params, con=con)

            week_df = df.drop_duplicates(subset=["StatId"], keep="first")
            week_df = week_df.drop(["Rank","BaseRank","RankingUrl"], axis=1)
            # get average Rank
            temp = df.groupby("StatId", as_index=False)["Rank"].mean().round().astype(int)
            week_df = week_df.merge(temp, on="StatId", how="left")
            # get average BaseRank
            temp = df.groupby("StatId", as_index=False)["BaseRank"].mean().round().astype(int)
            week_df = week_df.merge(temp, on="StatId", how="left")
            # get top RankingUrl (row with the best BaseRank per StatId)
            temp = df.sort_values("BaseRank", ascending=True).drop_duplicates("StatId").sort_index().reset_index(drop=True)
            week_df = pd.merge(week_df, temp[["StatId", "RankingUrl"]], on="StatId", how="left")
            # insert ranksweekly_df into RanksDaily_{save_name}

            print("Adding keywords...")
Example #46
0
def update_db(shadow_dir, station_list, dfile='shadows_data.db'):
    '''
     Update both tables: SHADOWS and STATIONS
    '''

    conn = sqlite3.connect(dfile)
    c = conn.cursor()
    today = datetime.strftime(datetime.now(), "%Y/%m/%d")
    #header :station_id,station_name,lon,lat
    new_stations = pd.read_csv(station_list, header=None)
    new_stations.columns = ['station_id', 'station_name', 'lon', 'lat']
    new_stations["Date"] = [today] * new_stations.shape[0]
    sql_com = "SELECT * FROM STATIONS"
    current_stations = pd.read_sql(sql_com, conn)
    for station in current_stations.station_id.values:
        new_stations.drop(
            new_stations.index[new_stations['station_id'] == station],
            inplace=True)
    if not new_stations.empty:
        #if it found something new, update sql data base
        print("Updating STATIONS table")
        new_stations.to_sql('STATIONS', conn, if_exists='append', index=False)
    else:
        print("No new stations added to STATIONS table")
        return
    #read the database again if it was updated
    current_stations = pd.read_sql(sql_com, conn)
    sql_com = "SELECT * FROM SHADOWS"
    shadow_old = pd.read_sql(sql_com, conn)
    #extract the info from the ifile
    from os.path import normpath, basename
    dir_info = basename(normpath(shadow_dir))
    #extract data from the name of directory
    maxdist, res, horstep, dummy = dir_info.replace("lh_", "").split("_")
    stations = []
    print("Checking for new data for SHADOWS table")
    for ifile in sorted(os.listdir(shadow_dir)):
        if ifile.startswith("lh_"):  #will probably find shadows.log here
            station = int(ifile.replace("lh_", "").split("_")[1])
            get_station = current_stations[current_stations['station_id'] ==
                                           station]
            if get_station.empty:
                print("Station %d not yet in STATIONS table" % station)
            else:
                print("Getting SHADOWS for station %d" % station)
                print("Reading shadows from %s" %
                      os.path.join(shadow_dir, ifile))
                read_shadows = pd.read_csv(os.path.join(shadow_dir, ifile),
                                           index_col=False)
                size = read_shadows.shape[0]
                az = read_shadows.azimuth.to_list()
                hor = read_shadows.horizon_height.to_list()
                lon = get_station.lon.values[0]
                lat = get_station.lat.values[0]
                station_name = get_station.station_name.values[0]
                shadow_new = pd.DataFrame({
                    "station_id": [station] * size,
                    "station_name": [station_name] * size,
                    "resolution": [res] * size,
                    "maxdistance": [maxdist] * size,
                    "horizonstep": [horstep] * size,
                    "azimuth": az,
                    "horizon_height": hor,
                    "Date": [today] * size
                })
                if shadow_old.empty:
                    shadow_new.to_sql('SHADOWS',
                                      conn,
                                      if_exists='append',
                                      index=False)
                else:
                    #drop from the new data any stations already in old data
                    for station in shadow_old.station_id.values:
                        shadow_new.drop(shadow_new.index[
                            shadow_new['station_id'] == station],
                                        inplace=True)
                    if not shadow_new.empty:
                        shadow_new.to_sql('SHADOWS',
                                          conn,
                                          if_exists='append',
                                          index=False)
                    else:
                        print("No new data added to the SHADOWS table")
    print("database updated")
    c.execute('''
    INSERT INTO DAILY_STATUS (station_id,station_name,Date) 
    SELECT DISTINCT clt.station_id, ctr.station_name, clt.Date
    FROM STATIONS clt
    LEFT JOIN SHADOWS ctr ON clt.station_id = ctr.station_id
          ''')
    conn.commit()

    c.execute('''
    SELECT DISTINCT *
    FROM DAILY_STATUS
    WHERE Date = (SELECT max(Date) FROM DAILY_STATUS)
          ''')
    df = pd.DataFrame(c.fetchall(),
                      columns=['station_id', 'station_name', 'Date'])
    print("New data")
    print(df)
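# Hypothetical call (the directory and CSV names are illustrative only; the shadow
# directory is expected to follow the lh_<maxdist>_<res>_<horstep>_<tag> naming
# pattern parsed inside update_db):
update_db("lh_5000_10_5_run1", "station_list.csv", dfile="shadows_data.db")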
Example #47
0
    def generate_plot_data(
        self,
        metric,
        parameter,
        model_group_ids,
        train_end_times,
    ):
        """Fetch data necessary for producing the plot from the distance table

        Arguments:
            metric (string) -- model evaluation metric, such as 'precision@'
            parameter (string) -- model evaluation metric parameter,
                such as '300_abs'
            model_group_ids (list) - Model group ids to include in the dataset
            train_end_times (list) - Train end times to include in the dataset

        Returns: (pandas.DataFrame) The relevant models and the percentage of time
            each was within various thresholds of the best model at that time
        """
        model_group_union_sql = ' union all '.join([
            '(select {} as model_group_id)'.format(model_group_id)
            for model_group_id in model_group_ids
        ])
        plot_min, plot_max = self.plot_bounds(metric, parameter)
        plot_tick_dist = self.plot_tick_dist(plot_min, plot_max)
        sel_params = {
            'metric': metric,
            'parameter': parameter,
            'model_group_union_sql': model_group_union_sql,
            'distance_table': self.distance_from_best_table.distance_table,
            'model_group_str': str_in_sql(model_group_ids),
            'train_end_str': str_in_sql(train_end_times),
            'series_start': plot_min,
            'series_end': plot_max,
            'series_tick': plot_tick_dist,
        }
        sel = """\
            with model_group_ids as ({model_group_union_sql}),
            x_vals AS (
                SELECT m.model_group_id, s.distance
                FROM (SELECT GENERATE_SERIES(
                {series_start}, {series_end}, {series_tick}
                ) AS distance) s
                CROSS JOIN
                (
                SELECT DISTINCT model_group_id FROM model_group_ids
                ) m
            )
            SELECT dist.model_group_id, distance, mg.model_type,
                    COUNT(*) AS num_models,
                    AVG(CASE WHEN dist_from_best_case <= distance THEN 1 ELSE 0 END) AS pct_of_time
            FROM {distance_table} dist
            JOIN x_vals USING(model_group_id)
            JOIN model_metadata.model_groups mg using (model_group_id)
            WHERE
                dist.metric='{metric}'
                AND dist.parameter='{parameter}'
                and model_group_id in ({model_group_str})
                and train_end_time in ({train_end_str})
            GROUP BY 1,2,3
        """.format(**sel_params)

        return (pd.read_sql(sel, self.distance_from_best_table.db_engine)
                .sort_values(['model_group_id', 'distance']))
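# Self-contained sketch of how the model_group_union_sql fragment above is
# assembled; the ids are illustrative only.
model_group_ids = [3, 7, 12]
model_group_union_sql = ' union all '.join(
    '(select {} as model_group_id)'.format(mg) for mg in model_group_ids
)
# -> "(select 3 as model_group_id) union all (select 7 as model_group_id)
#     union all (select 12 as model_group_id)"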
Example #48
0
def test_data(cnxn):
    # 作物名稱 = crop name, 平均價 = average price; filters for cauliflower (花椰)
    query = "SELECT 作物名稱,平均價 FROM dbo.Veg WHERE 作物名稱 LIKE (N'%花椰%')"
    df = pd.read_sql(query, cnxn)
    return df
Example #49
0
    def getTable(self, tablename='agmednet_01'):
        self.connectSQL()
        df = pd.read_sql("SELECT * FROM " + tablename, self.engine)
        return df
Example #50
0
def main():

    # Load settings
    filepath_settings = 'C:/DISCHARGEDB/code/data/settings.json'
    settings = initSettings()
    saveSettings(settings, filepath_settings)
    settings = fillSettingsTags(loadSettings(filepath_settings))

    # Download new images from AgMednet
    discharge = DISCHARGEDB(database=settings['database'])
    #discharge.download_images(settings)
    #discharge.update_images(settings)

    discharge.truncateTable(tablename='dicom')
    discharge.update_dicom(settings)

    ### Update agmednet reports ###
    discharge = DISCHARGEDB(host="127.0.0.1",
                            port='3306',
                            user="******",
                            password="******",
                            database=settings['database'])
    discharge.update_agmednet_01(settings)
    discharge.update_agmednet_02(settings)

    discharge = DISCHARGEDB(host="127.0.0.1",
                            port='3306',
                            user="******",
                            password="******",
                            database=settings['database'])
    rs = discharge.truncateTable('agmednet_02')
    discharge.update_agmednet_02(settings)
    df = discharge.getTable('agmednet_02')

    df = discharge.getTable('agmednet_01')

    #### Execute script #####
    discharge = DISCHARGEDB(host="127.0.0.1",
                            port='3306',
                            user="******",
                            password="******",
                            database=settings['database'])
    table = discharge.getTable('agmednet_01')
    discharge.truncateTable('agmednet_01')
    discharge.truncateTable('agmednet_02')
    discharge.connectSQL()

    table = discharge.getTable('agmednet_01')
    table = discharge.getTable('agmednet_01')

    self = db
    mysql_path = 'mysql://' + self.user + ':' + self.password + '@' + self.host + '/' + self.database + '?charset=utf8'
    sqlEngine = create_engine(mysql_path)
    df = pd.read_sql("SELECT * FROM agmednet_01", sqlEngine)

    ### Reset autoincrement
    db = DISCHARGEDB(host="127.0.0.1",
                     port='3306',
                     user="******",
                     password="******",
                     database=settings['database'])
    db.connectSQL()
    db.resetAutoIncrement()

    #db.createDB()
    db.initDB(settings)
    db.executeScript(
        fip_script=
        'H:/cloud/cloud_data/Projects/DISCHARGEDB/src/scripts/set_primary_key.sql',
        replace=('TABLE_VAR', 'v_a06_docu_hosp'))

    result = db.executeSQL('SELECT * FROM dischargedb3.site;')
    db.sas7bdatTosql()
    db.closeSQL()

    filename = 'v_a01_fu_staff'

    db = DISCHARGEDB(database=settings['database'])
    db.connectSQL()
    db.executeSQL('ALTER TABLE ' + filename + ' ADD PRIMARY KEY index')

    command = "ALTER TABLE `dischargedb`.`v_a03_ses_staff` CHANGE COLUMN `index` `index` BIGINT NOT NULL ,"

    cursor = db.db.cursor()
    cursor.execute(command)
    result = cursor.fetchall()

    db = DISCHARGEDB(database=settings['database'])
    db.connectSQL()
    #command = "ALTER TABLE `dischargedb`.`v_a02_fu_questf_sub01` CHANGE COLUMN `index` `index` BIGINT NULL ,ADD PRIMARY KEY (`index`);;"
    command = "ALTER TABLE dischargedb.v_a03_ses_staff CHANGE COLUMN index index BIGINT NOT NULL"
    db.executeSQL(command)

    ##############################

    reader = SAS7BDAT(
        'H:/cloud/cloud_data/Projects/DISCHARGEDB/data/tmp/ecrf/v_g02_ct_reading_a.sas7bdat',
        skip_header=False)
    df1 = reader.to_data_frame()
    for i in range(len(reader.columns)):
        f = reader.columns[i].format
        print('format:', f)

    c = reader.columns[10]

    fip = 'H:/cloud/cloud_data/Projects/DISCHARGEDB/data/tmp/ecrf/v_a01_fu_staff.sas7bdat'
    df = pd.read_sas(fip, format='sas7bdat', encoding='iso-8859-1')
    df.to_sql(con=con,
              name='table_name_for_df',
              if_exists='replace',
              flavor='mysql')

    mysql_path = 'mysql://*****:*****@localhost/?charset=utf8'
    engine = create_engine(mysql_path, encoding="utf-8", echo=False)
    # with engine.connect() as con:
    # con.execute("use dischargedb3; drop table if exists " + name + ";")
    # df = pd.read_excel(path)
    # df.to_sql(name, engine, index=False)

    fip = 'H:/cloud/cloud_data/Projects/DISCHARGEDB/data/tables/sas/v_a02_fu_questf_sub01.sas7bdat'
    df = pd.read_sas(fip, format='sas7bdat', encoding='iso-8859-1')

    with engine.connect() as con:
        #con = engine.connect()
        con.execute("use dischargedb3;")
    df.to_sql('table6', engine, index=False)

    df = pd.read_excel(
        'H:/cloud/cloud_data/Projects/DISCHARGEDB/data/tables/xlsx/discharge_ecrf_01092020.xlsx',
        sheet_name='Sheet1',
        index_col=0)
Example #51
0
# In[2]:

my_db = MySQLdb.connect(host='localhost',
                        user='******',
                        passwd='yesican',
                        db='pythonBuild')
cursor = my_db.cursor()

# In[3]:

read_query = 'SELECT * FROM nifty_it_index;'

# In[4]:

nifty_it_index = pd.read_sql(read_query,
                             my_db,
                             index_col=['Date'],
                             parse_dates=True)

# In[5]:

nifty_it_index.info()

# In[6]:

nifty_it_index.head()

# In[7]:

read_query = 'SELECT * FROM infy_stock;'

# In[8]:
Example #52
0
def info_region_match():
    region_info = pd.read_sql(
        "select fund_id, region from fund_info_aggregation WHERE region IS NOT NULL AND region <> ''",
        engine_rd)
    file = codecs.open("./Scripts/DataQuality/fund_info/TestChooseAddress.js",
                       'r', "utf-8")
    region_file = file.read()
    region_info_copy = region_info.copy()  # work on a real copy, not a second reference
    prov = {
        '11': '北京',
        '12': '天津',
        '13': '河北',
        '14': '山西',
        '15': '内蒙古',
        '21': '辽宁',
        '22': '吉林',
        '23': '黑龙江',
        '31': '上海',
        '32': '江苏',
        '33': '浙江',
        '34': '安徽',
        '35': '福建',
        '36': '江西',
        '37': '山东',
        '41': '河南',
        '42': '湖北',
        '43': '湖南',
        '44': '广东',
        '45': '广西',
        '46': '海南',
        '50': '重庆',
        '51': '四川',
        '52': '贵州',
        '53': '云南',
        '54': '西藏',
        '61': '陕西',
        '62': '甘肃',
        '63': '青海',
        '64': '宁夏',
        '65': '新疆',
        '71': '台湾',
        '81': '香港',
        '82': '澳门',
        '90': '外国'
    }
    for i in range(len(region_info)):
        region = region_info['region'][i]
        pattern = r"'(\d+)','({}.*?)'".format(region)  # quoted area code followed by a name starting with `region`
        try:
            match_result = re.search(pattern, region_file).groups()
            if match_result[0] in ('440300', '440303', '440304', '440305',
                                   '440306', '440307', '440308', '440391',
                                   '440392'):
                region_info_copy.loc[i, 'region'] = '深圳'  # Shenzhen district codes map to the city itself
                # print(region_info['fund_id'][i])
            else:
                prov_code = match_result[0][0:2]
                region_info_copy.loc[i, 'region'] = prov[prov_code]
                # sql = "update easy.fund_info_aggregation set region = '{}' where fund_id = '{}'".format(
                #     region_info_copy['region'][i], region_info_copy['fund_id'][i])
                # engine_rd.execute(sql)
        except Exception as e:
            print(region_info['fund_id'][i], region)
    return region_info_copy
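# Toy illustration of the pattern match above; the exact format of
# TestChooseAddress.js is an assumption (quoted code/name pairs, comma-separated).
import re

sample_file = "'440303','深圳市福田区'\n'110101','北京东城区'"
region = '北京'
pattern = r"'(\d+)','({}.*?)'".format(region)
code, name = re.search(pattern, sample_file).groups()
prov_code = code[0:2]        # the first two digits identify the province
print(prov_code, name)       # -> 11 北京东城区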
Example #53
0
          MIN(s.CourseID) as MathStarting        
          FROM VsaDev.dbo.StudyPlan as s        
          where s.CourseID in (select CourseID from Course as c where        
            c.CourseNumber like 'MATH%')        
          GROUP by s.GeneratedPlanID) as m1 on m1.GeneratedPlanID = g.GeneratedPlanID 
                 -- find the english starting point
        full JOIN        
        (select  s.GeneratedPlanID,        
          MIN(s.CourseID) as EnglishStarting        
          FROM VsaDev.dbo.StudyPlan as s        
          where s.CourseID in (select CourseID from Course as c where c.CourseNumber like 'ENGL%')        
          GROUP by s.GeneratedPlanID) as eng on eng.GeneratedPlanID = g.GeneratedPlanID;")



vaaData = pd.read_sql(data, conn) 




vaaData.shape


# Drop the ID columns, as they don't serve much purpose for this analysis.

vaaData = vaaData.drop(columns=['ParameterSetID', 'MajorID', 'SchoolID',
                                'JobTypeID', 'QuarterPreferenceID',
                                'StartingQuarter'])
Example #54
0
def data():
    return pd.read_sql('passengers', DATABASE_URL).to_dict(orient="index")
Example #55
0
def get_from_db(db, table_name):
    df = pd.read_sql("SELECT * FROM {}".format(table_name), db)
    return df
Example #56
0
    def read_data(self):
        engine = create_engine(self.db_url, echo=False)
        return pd.read_sql(self.sql_query, engine, **self.pandas_kwargs_read)
Example #57
0
def getAvgSurveyResults():
    avgResults = pd.read_sql(
        "select value as Question_Num, Data_Type, Chart_Type, sum(Correct) AS numCorrect, (sum(Correct) / (COUNT(Distinct Survey_ID))) * 100 As percent_correct from survey_results.survey_results where value!='feedbk' group by value",
        conn)
    return avgResults.to_json(orient='records')
Example #58
0
"""

import sqlite3
conn = sqlite3.connect(
    r'D:\Work\DataScience\DS ppt\datascience notes\SQLDB\employee.db')
cur = conn.cursor()
cur.execute('select * from EMPLOYEE')

for rows in cur:
    print(list(rows))
#############################################################################
import sqlite3
import pandas as pd
conn = sqlite3.connect(
    r'D:\Work\DataScience\DS ppt\datascience notes\SQLDB\employee.db')
emp_set = pd.read_sql('select * from EMPLOYEE', conn)
#############################################################################
import sqlite3
conn = sqlite3.connect(
    r'D:\Work\DataScience\DS ppt\datascience notes\SQLDB\employee.db')
cur = conn.cursor()
cur.execute(
    "INSERT INTO EMPLOYEE(EMP_ID,NAME,LOCATION,SALARY) VALUES(106,'KIRAN','CHENNAI',70000)"
)
conn.commit()

#############################################################################
a = [[1, "Ashwin", "Chennai"], [2, "Raina", "Chennai"],
     [3, "Steyn", "Hydrabad"]]
b = [[2, "Raina", "Chennai"], [4, "Kohli", "Hydrabad"], [5, "Dhoni", "Pune"]]
Example #59
0
def getNewSurveyResults():
    newResults = pd.read_sql(
        "SELECT COUNT(Distinct Survey_ID) AS numberOFattempts, COUNT(Value) AS questionsAnswered, (SUM(correct) / COUNT(*)) * 100 AS pctCorrect, SUM(correct) AS numCorrect, SUM(correct != 1) as numIncorrect,  SUM(correct)/COUNT(Distinct Survey_ID) AS avgScore FROM survey_results.survey_results where value!='feedbk'",
        conn)
    return newResults.to_json(orient='records')
Example #60
0
def get_sql(sql, db_file, params=None):
    with sqlite3.connect(db_file) as db:
        # pass params by keyword: read_sql's third positional argument is index_col
        return pd.read_sql(sql, db, params=params)
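# Self-contained usage sketch for get_sql above; the table and file name are
# illustrative only (a throwaway SQLite file is created just for the demo).
import sqlite3
import pandas as pd

demo_db = "demo_trips.db"
with sqlite3.connect(demo_db) as db:
    pd.DataFrame({"fare": [5.0, 12.5, 30.0]}).to_sql(
        "trips", db, if_exists="replace", index=False)

expensive = get_sql("SELECT * FROM trips WHERE fare > ?", demo_db, params=(10,))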