def get_train_and_test(t_dict, max_f=100, idf=True, ngram=(1, 2)):
    """Split the tweet dict into train and test sets and vectorize both."""
    stop_words = get_stop_words()
    train_keys, test_keys = train_test_split(t_dict)

    # Fit the count vectorizer and (optional) tf-idf transformer on the
    # training tweets only.
    train_vec, counter, transformer = tweets_to_vectors(
        [t_dict[k] for k in train_keys],
        max_f=max_f, idf=idf, ngram=ngram, stop_words=stop_words)

    # Transform the test tweets with the already-fitted counter/transformer
    # so both sets share one vocabulary; discard the returned copies so they
    # don't shadow the fitted objects.
    test_vec, _, _ = tweets_to_vectors(
        [t_dict[k] for k in test_keys],
        stop_words=stop_words, counter=counter, transformer=transformer)

    return train_vec, test_vec, counter
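# Minimal usage sketch. The sample tweets are hypothetical, and
# `train_test_split` is assumed to be this module's own key-splitting helper
# (it is called on a dict and returns two lists of keys, unlike sklearn's).
if __name__ == "__main__":
    sample_tweets = {
        1: "cattle futures climbed on strong export demand",
        2: "feeder cattle prices steady ahead of the USDA report",
        3: "corn rally pressures livestock feeding margins",
        4: "packers bid higher in the southern plains cash trade",
    }
    train_vec, test_vec, counter = get_train_and_test(sample_tweets, max_f=20)
    print(train_vec.shape, test_vec.shape)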
# Keep only the non-empty lines of each ingredient list; filtering happens
# within each row so the column keeps its original length.
most_probable_docs['ingredient_txt'] = [
    [s for s in w if s != '']
    for w in most_probable_docs['ingredient_txt'].str.split('\n')
]

doc_data = collections.defaultdict(list)
for topic, g in most_probable_docs.groupby('topic'):
    # DataFrame.sort() was removed in pandas 0.20; sort_values() is the
    # current equivalent.
    rows = g.sort_values('prob')[['ingredient_txt', 'image', 'url', 'title', 'key']].values
    doc_data[topic] = [
        dict(zip(['ingredient', 'image', 'url', 'title', 'key'], row))
        for row in rows
    ]

# Pickle files must be opened in binary mode.
with open('frontend/app/doc_data.pkl', 'wb') as f:
    pickle.dump(doc_data, f)

engine = p.make_engine()
df.to_sql('clean_recipes', engine, if_exists='replace')

save_data_for_frontend(m, vectorizer, df)

## Prepare and save the vectorizer and feature matrix used for standard
## cosine-similarity search, so the frontend only has to embed the query.
vv = CountVectorizer(
    stop_words=p.get_stop_words(),
    ngram_range=(1, 1),
    token_pattern='[A-Za-z]+',
)
search_cols = df['ingredient_txt_no_stopwords'].str.cat(df['title'].values, sep=' ')
vv = vv.fit(search_cols)
all_features = vv.transform(search_cols)

with open('frontend/app/search_vectorizer.pkl', 'wb') as f:
    pickle.dump(vv, f)
with open('frontend/app/search_all_features.pkl', 'wb') as f:
    pickle.dump(all_features, f)
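# A minimal sketch of the frontend-side lookup these pickles enable. This
# helper is hypothetical (not part of the pipeline above): it re-embeds a
# query string with the saved vectorizer and ranks recipes by cosine
# similarity against the precomputed feature matrix.
import pickle
from sklearn.metrics.pairwise import cosine_similarity

def search_recipes(query, top_n=10):
    with open('frontend/app/search_vectorizer.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    with open('frontend/app/search_all_features.pkl', 'rb') as f:
        features = pickle.load(f)
    query_vec = vectorizer.transform([query])
    scores = cosine_similarity(query_vec, features).ravel()
    # Indices of the top_n best-matching rows of `df`, best first.
    return scores.argsort()[::-1][:top_n]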
PROD = "https://lab.agblox.com/api" DEV = "http://localhost:5000/api" TWEET_END = "/tweets" ENDPOINT = "/predictions" TWEET_ENDPOINT = "/tweets" COMMODITY_ENDPOINT = "/commodities" USER = "******" PASS = "******" COMMODITY = 'cattle' MIN_KEYWORD_COUNT = 500 headers = {'content-type': 'application/json'} category_classifier = load_classifier(COMMODITY) stop_words = get_stop_words() def get_tweets_from_api(start_date="2019-08-27"): params = {"start": start_date} r = requests.get(PROD + TWEET_ENDPOINT, auth=HTTPBasicAuth(USER, PASS), params=params) return json.loads(r.text) def get_commodity_price_from_api(product_code="GF", expiration_code="U9", start_date="2019-08-25"): params = { "start": start_date,