Code Example #1
from sklearn.model_selection import train_test_split

# get_stop_words() and tweets_to_vectors() are project-local helpers.
def get_train_and_test(t_dict, max_f=100, idf=True, ngram=(1, 2)):
    stop_words = get_stop_words()
    # Split the tweet-dict keys into train/test partitions
    # (train_test_split needs an indexable sequence, not a dict).
    train_keys, test_keys = train_test_split(list(t_dict))
    # Fit the vectorizer (counter + tf-idf transformer) on the training tweets.
    train_vec, counter, transformer = tweets_to_vectors(
        [t_dict[k] for k in train_keys],
        max_f=max_f,
        idf=idf,
        ngram=ngram,
        stop_words=stop_words)
    # Reuse the fitted counter/transformer so the test vectors share
    # the training vocabulary.
    test_vec, counter, transformer = tweets_to_vectors(
        [t_dict[k] for k in test_keys],
        stop_words=stop_words,
        counter=counter,
        transformer=transformer)
    return train_vec, test_vec, counter
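
A minimal usage sketch, assuming t_dict maps tweet IDs to raw tweet text and that tweets_to_vectors() returns sparse matrices (both are assumptions; only the signature above comes from the source):

toy_tweets = {
    "t1": "corn futures rallied on export news",
    "t2": "cattle prices steady ahead of the report",
    "t3": "dry weather pressures the wheat belt",
    "t4": "feeder cattle contracts slip at the open",
}
train_vec, test_vec, counter = get_train_and_test(toy_tweets, max_f=50)
# train_vec and test_vec share one vocabulary, so a model fit on
# train_vec can score test_vec directly.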
Code Example #2
File: run.py Project: ajerneck/thatsfordinner
    # Split each recipe's ingredient text into a list of lines.
    most_probable_docs['ingredient_txt'] = [w for w in most_probable_docs['ingredient_txt'].str.split('\n') if w != []]
    doc_data = collections.defaultdict(list)
    for topic, g in most_probable_docs.groupby('topic'):
        # DataFrame.sort() was removed in pandas 0.20; sort_values() is the current API.
        row = g.sort_values('prob')[['ingredient_txt', 'image', 'url', 'title', 'key']].values
        doc_data[topic] = [dict(zip(['ingredient', 'image', 'url', 'title', 'key'], x)) for x in row]
    # Pickle writes bytes, so the file must be opened in binary mode.
    with open('frontend/app/doc_data.pkl', 'wb') as f:
        pickle.dump(doc_data, f)

    engine = p.make_engine()
    df.to_sql('clean_recipes', engine, if_exists='replace')

save_data_for_frontend(m, vectorizer, df)

## Fit and save the vectorizer and feature matrix used for cosine-similarity
## search; prepared ahead of time so the frontend can rank queries quickly.
vv = CountVectorizer(
    stop_words=p.get_stop_words(),
    ngram_range=(1, 1),
    token_pattern='[A-Za-z]+',
)

search_cols = df['ingredient_txt_no_stopwords'].str.cat(df['title'].values, sep=' ')
vv = vv.fit(search_cols)
all_features = vv.transform(search_cols)

with open('frontend/app/search_vectorizer.pkl', 'wb') as f:
    pickle.dump(vv, f)
with open('frontend/app/search_all_features.pkl', 'wb') as f:
    pickle.dump(all_features, f)
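
The comment above frames these pickles as inputs to standard cosine-similarity search. A minimal sketch of the query side, assuming the frontend loads them back and ranks recipes against a free-text query (none of this loading code appears in the source):

import pickle
from sklearn.metrics.pairwise import cosine_similarity

with open('frontend/app/search_vectorizer.pkl', 'rb') as f:
    vv = pickle.load(f)
with open('frontend/app/search_all_features.pkl', 'rb') as f:
    all_features = pickle.load(f)

def search(query, top_n=5):
    # Vectorize the query with the fitted vocabulary, then rank all
    # recipes by cosine similarity against the precomputed matrix.
    q_vec = vv.transform([query])
    sims = cosine_similarity(q_vec, all_features).ravel()
    return sims.argsort()[::-1][:top_n]  # row indices of the best matches

print(search("chicken garlic lemon"))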
Code Example #3
import json

import requests
from requests.auth import HTTPBasicAuth

PROD = "https://lab.agblox.com/api"
DEV = "http://localhost:5000/api"
ENDPOINT = "/predictions"
TWEET_ENDPOINT = "/tweets"
COMMODITY_ENDPOINT = "/commodities"
USER = "******"
PASS = "******"

COMMODITY = 'cattle'
MIN_KEYWORD_COUNT = 500

headers = {'content-type': 'application/json'}
# load_classifier() and get_stop_words() are project-local helpers.
category_classifier = load_classifier(COMMODITY)
stop_words = get_stop_words()


def get_tweets_from_api(start_date="2019-08-27"):
    # Fetch all tweets since start_date from the production API.
    params = {"start": start_date}
    r = requests.get(PROD + TWEET_ENDPOINT,
                     auth=HTTPBasicAuth(USER, PASS),
                     params=params)
    return json.loads(r.text)
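
A hypothetical usage sketch (the response is assumed to be a JSON array of tweet objects; the source only shows the request side):

tweets = get_tweets_from_api(start_date="2019-08-27")
print("fetched", len(tweets), "tweets")  # assumes the endpoint returns a JSON list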


def get_commodity_price_from_api(product_code="GF",
                                 expiration_code="U9",
                                 start_date="2019-08-25"):
    # e.g. "GF" is the CME feeder-cattle futures code; "U9" = September 2019.
    params = {
        "start": start_date,