def train(feature_set_name: str, model_name: str, queries_file: str,
          judgments_file: str, index_name: str, features_file: str,
          model_output: str, protected_feature_name="1", gamma=1,
          number_of_iterations=3000, learning_rate=0.001, lambdaa=0.001,
          init_var=0.01, standardize=False, log=None):
    """ Train and upload model with specified parameters """
    es = elastic_connection(timeout=1000)
    collect_train_data(es, queries_file, judgments_file, feature_set_name,
                       index_name, features_file)
    train_model(features_file, model_output, protected_feature_name, gamma,
                number_of_iterations, learning_rate, lambdaa, init_var,
                standardize, log)
    save_model(model_name, feature_set_name, model_output)
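# A hypothetical example call for the DELTR training entry point above;
# every file and index name here is illustrative, not from the source:
train(feature_set_name="deltr_features",
      model_name="deltr_model",
      queries_file="queries.csv",
      judgments_file="judgments.csv",
      index_name="documents",
      features_file="features.txt",
      model_output="model.txt")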
def train():
    from judgments import judgments_from_file, judgments_by_qid

    es = elastic_connection(timeout=1000)
    # Load features into Elasticsearch
    init_default_store()
    load_features(FEATURE_SET_NAME)
    # Parse the judgments file, grouped by query id
    movieJudgments = judgments_by_qid(
        judgments_from_file(filename=JUDGMENTS_FILE))
    # Use the proposed Elasticsearch queries (1.json.jinja ... N.json.jinja)
    # to generate a training set, output as "sample_judgments_wfeatures.txt"
    log_features(es, judgments_dict=movieJudgments, search_index=INDEX_NAME)
    build_features_judgments_file(movieJudgments,
                                  filename=JUDGMENTS_FILE_FEATURES)

    # Train each RankLib model type:
    # 0: MART, 1: RankNet, 2: RankBoost, 3: AdaRank, 4: Coordinate Ascent,
    # 6: LambdaMART, 7: ListNet, 8: Random Forests, 9: Linear Regression
    for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
        Logger.logger.info("*** Training %s " % modelType)
        train_model(judgments_with_features_file=JUDGMENTS_FILE_FEATURES,
                    model_output='model.txt',
                    which_model=modelType)
        save_model(script_name="gsearch_model_%s" % modelType,
                   feature_set=FEATURE_SET_NAME,
                   model_fname='model.txt')
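# train_model is defined elsewhere in the repo; a minimal sketch of how it
# might shell out to the RankLib jar (the jar path is an assumption, and the
# real helper may pass additional flags such as a training metric):
import subprocess

def train_model(judgments_with_features_file, model_output, which_model):
    # -ranker selects the RankLib algorithm, matching the numbering above
    subprocess.run(["java", "-jar", "RankLib.jar",
                    "-train", judgments_with_features_file,
                    "-ranker", str(which_model),
                    "-save", model_output],
                   check=True)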
def index(index_name, document_dir):
    """
    Index the data.

    :param index_name: Name of the index to create
    :param document_dir: Path to the directory containing the JSON documents
        to be uploaded. Each JSON document *must* have an "id" field.
    :return:
    """
    es = elastic_connection(timeout=30)
    reindex(es, document_list=create_document_list(document_dir),
            index=index_name)
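# create_document_list is referenced but not shown; a plausible sketch that
# loads every JSON file in the directory (the real helper's behavior may differ):
import json
import os

def create_document_list(document_dir):
    documents = []
    for fname in sorted(os.listdir(document_dir)):
        if fname.endswith(".json"):
            with open(os.path.join(document_dir, fname), encoding="utf-8") as f:
                documents.append(json.load(f))
    return documents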
def search(index_name, query, model, verbose):
    """
    Performs a search request on Elasticsearch using LTR and a specified
    (DELTR) model.

    :param index_name: The index to search on
    :param query: The query to search by
    :param model: The model to search with
    :param verbose: Whether or not the output should contain the feature values
    :return:
    """
    es = elastic_connection(timeout=1000)
    results = es.search(index=index_name, body=ltr_query(query, model))
    for result in results['hits']['hits']:
        message = result['_source']['id']
        if verbose:
            features = result['fields']['_ltrlog'][0]['log_entry']
            message += ' ' + ' '.join(
                ['{0}:{1}'.format(ll['name'], ll['value']) for ll in features])
        Logger.logger.info(message)
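# ltr_query is not shown; with the Elasticsearch LTR plugin the request body
# usually wraps an sltr rescore query plus the ltr_log extension that produces
# the "log_entry" feature log read above. A sketch, assuming a "title" match
# field and a rescore window of 1000 (both assumptions, not from the source):
def ltr_query(query, model):
    return {
        "query": {"match": {"title": query}},
        "rescore": {
            "window_size": 1000,
            "query": {
                "rescore_query": {
                    "sltr": {"params": {"keywords": query},
                             "model": model}
                }
            }
        },
        # Ask the plugin to log each feature value under the name "log_entry"
        "ext": {
            "ltr_log": {
                "log_specs": {"name": "log_entry", "rescore_index": 0}
            }
        }
    }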
def training_pipeline():
    from utils import elastic_connection

    es = elastic_connection()
    file_judgments = parse_data_and_get_judgement()
    print(file_judgments)
    init_default_store()
    load_features(FEATURE_SET_NAME)
    log_features(es, judgments_dict=file_judgments, search_index=INDEX_NAME)
    build_features_judgments_file(file_judgments,
                                  filename=JUDGMENTS_FILE_FEATURES)
    # for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    for modelType in [6, 7, 9]:
        Logger.logger.info("*** Training %s " % modelType)
        train_model(judgments_with_features_file=JUDGMENTS_FILE_FEATURES,
                    model_output='model.txt',
                    which_model=modelType)
        save_model(script_name="test_%s" % modelType,
                   feature_set=FEATURE_SET_NAME,
                   model_fname='model.txt')
    settings['mappings'] = mapping_settings
    es_connection.indices.delete(index, ignore=[400, 404])
    es_connection.indices.create(index, body=settings)
    elasticsearch.helpers.bulk(es_connection, bulk_docs(movie_dict, index))


def bulk_docs(movie_dict, index):
    print(len(movie_dict))
    for movie_id, movie in movie_dict.items():
        if 'release_date' in movie and movie['release_date'] == "":
            del movie['release_date']
        enrich(movie)
        add_cmd = {"_index": index,
                   "_type": "movie",
                   "_id": movie_id,
                   "_source": movie}
        yield add_cmd
        if 'title' in movie:
            Logger.logger.info("%s added to %s"
                               % (movie['title'].encode('utf-8'), index))


if __name__ == "__main__":
    es = elastic_connection(timeout=30)
    tmdb_movie_dict = json.loads(open('tmdb.json', encoding='utf-8').read())
    reindex(es, movie_dict=tmdb_movie_dict)
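# mapping_settings is defined elsewhere; a minimal sketch of a mapping for
# the "movie" type used above (the field list is an assumption):
mapping_settings = {
    "movie": {
        "properties": {
            "title": {"type": "text"},
            "overview": {"type": "text"},
            "release_date": {"type": "date"}
        }
    }
}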
print("I am here") resp = requests.post(full_path, data=json.dumps(model_payload), headers=head, auth=ES_AUTH, verify=False) Logger.logger.info(resp.status_code) if resp.status_code >= 300: Logger.logger.error(resp.text) if __name__ == "__main__": from judgments import judgments_from_file, judgments_by_qid es = elastic_connection(timeout=1000) # Load features into Elasticsearch init_default_store() load_features(FEATURE_SET_NAME) # Parse a judgments movieJudgments = judgments_by_qid( judgments_from_file(filename=JUDGMENTS_FILE)) # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set # output as "sample_judgments_wfeatures.txt" log_features(es, judgments_dict=movieJudgments, search_index=INDEX_NAME) build_features_judgments_file(movieJudgments, filename=JUDGMENTS_FILE_FEATURES) # Train each ranklib model type #for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]: for modelType in [6]: # 0, MART
    # Add feature values back to each judgment
    features_per_doc = {}
    for doc in res['hits']['hits']:
        docId = doc['_id']
        features = doc['fields']['_ltrlog'][0]['main']
        features_per_doc[docId] = feature_dict_to_list(features)

    # Append features from Elasticsearch back to the RankLib judgment list
    for judgment in judgments:
        try:
            # A KeyError here means we have a judgment but no document in the index
            judgment.features = features_per_doc[judgment.docId]
        except KeyError as e:
            print(e)
            Logger.logger.info("Missing id %s" % judgment.docId)


def build_features_judgments_file(judgments_with_features, filename):
    with open(filename, 'w') as judgmentFile:
        for qid, judgmentList in judgments_with_features.items():
            for judgment in judgmentList:
                judgmentFile.write(judgment.to_ranklib_format() + "\n")


if __name__ == "__main__":
    es_connection = elastic_connection()
    judgmentsByQid = judgments_by_qid(judgments_from_file(JUDGMENTS_FILE))
    log_features(es_connection, judgmentsByQid, INDEX_NAME)
    build_features_judgments_file(judgmentsByQid, JUDGMENTS_FILE_FEATURES)
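# to_ranklib_format writes the standard RankLib training format, one judged
# document per line (the grades, feature values, and ids below are illustrative):
#
#   4 qid:1 1:9.8 2:10.7 # 7555 rambo
#   0 qid:1 1:6.6 2:0.0  # 1368 rambo
#
# i.e. <grade> qid:<query id> <feature id>:<value> ... # <docId> <keywords>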
def generate_gsearch_judgements():
    es_connection = elastic_connection()
    data = _get_json()
    _write_file(es_connection, data)
import pandas as pd

from utils import elastic_connection, expert_feedback_index_name

es = elastic_connection()


def get_data():
    res = es.search(index=expert_feedback_index_name, size=20)
    rows = []
    for doc in res['hits']['hits']:
        source = doc['_source']
        rows.append([source["doc_id"], source["search_query"], source["grade"]])
    dataframe = pd.DataFrame(rows,
                             columns=['doc_id', 'search_query', 'grade'])
    return dataframe
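# size=20 caps the hits returned above; if the feedback index grows larger,
# a scrolled fetch could be used instead. A sketch using the standard
# elasticsearch.helpers.scan helper (reuses the es client defined above):
from elasticsearch.helpers import scan

def get_all_data():
    rows = [[doc['_source']['doc_id'],
             doc['_source']['search_query'],
             doc['_source']['grade']]
            for doc in scan(es, index=expert_feedback_index_name,
                            query={"query": {"match_all": {}}})]
    return pd.DataFrame(rows, columns=['doc_id', 'search_query', 'grade'])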
if __name__ == "__main__": """This script set the default roles and users to run the LTR ranklib-demo""" if len(sys.argv) == 2: password = getpass.getpass() elif len(sys.argv) == 3: password = sys.argv[2] else: Logger.logger.info( """prepare_xpack.py [elasticsearch.user] [elasticsearch.password]""" ) sys.exit(-1) username = sys.argv[1] es = elastic_connection(http_auth=(username, password)) xpack = XPackClient(es) Logger.logger.info("Configure ltr_admin role:") res = xpack.security.put_role( 'ltr_admin', { "cluster": ["all"], "indices": [{ "names": [".ltrstore*"], "privileges": ["all"] }] }) Logger.logger.info(res) Logger.logger.info("Configure tmdb role:") res = xpack.security.put_role(