def loadJudgments(judgFile):
    """Load previously collected judgments from *judgFile*.

    Prints a per-query judgment-count profile (most-judged queries first)
    as a side effect.

    Returns a tuple ``(currJudgments, existingKws, lastQid)``:
      - currJudgments: list of judgments read from the file (empty when
        the file does not exist),
      - existingKws: set of keyword strings that already have judgments,
      - lastQid: qid of the final judgment in the file, or 0 when the
        file is missing or empty.
    """
    currJudgments = []
    existingKws = set()
    lastQid = 0
    try:
        currJudgments = list(judgmentsFromFile(judgFile))
        existingKws = {judg.keywords for judg in currJudgments}
        judgDict = judgmentsByQid(currJudgments)
        # One (representative judgment, count) pair per query, sorted so
        # the most heavily judged queries print first.
        judgProfile = [(judglist[0], len(judglist))
                       for judglist in judgDict.values()]
        judgProfile.sort(key=lambda prof: prof[1], reverse=True)
        for judg, count in judgProfile:
            print("%s has %s judgments" % (judg.keywords, count))
        # Guard against an existing-but-empty file: the original
        # currJudgments[-1] would raise IndexError here.
        if currJudgments:
            lastQid = currJudgments[-1].qid
    except FileNotFoundError:
        # No judgment file yet: return the empty defaults.
        pass
    return (currJudgments, existingKws, lastQid)
def train(self):
    """Run the full LTR training pipeline and deploy one model.

    Loads the feature set into Elasticsearch, logs feature values for the
    labelled judgments, trains the RankLib model type selected by
    ES_MODEL_TYPE, uploads it under ES_MODEL_NAME, and returns a status
    string containing the tail of the training log.
    """
    # Load features into Elasticsearch
    initDefaultStore()
    loadFeatures(ES_FEATURE_SET_NAME)

    # Parse the judgments (relevance labels), grouped by query id
    label_file = self.find_label_file()
    print(self.find_label_file(), file=sys.stderr)
    movieJudgments = judgmentsByQid(
        judgmentsFromFile(filename=label_file))

    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "sample_judgments_wfeatures.txt"
    logFeatures(self.__es, judgmentsByQid=movieJudgments)
    buildFeaturesJudgmentsFile(
        movieJudgments, filename='sample_judgments_wfeatures.txt')

    # Train one ranklib model type, selected via configuration
    # for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    modelType = int(ES_MODEL_TYPE)
    # 0, MART
    # 1, RankNet
    # 2, RankBoost
    # 3, AdaRank
    # 4, coord Ascent
    # 6, LambdaMART
    # 7, ListNET
    # 8, Random Forests
    # 9, Linear Regression
    print("*** Training %s " % modelType)
    self.trainModel(judgmentsWithFeaturesFile='sample_judgments_wfeatures.txt',
                    modelOutput='model.txt',
                    whichModel=modelType)
    self.saveModel(scriptName=ES_MODEL_NAME, featureSet='movie_features',
                   modelFname='model.txt')

    # Surface the last few lines of the RankLib training log in the
    # returned status message. NOTE(review): hard-coded container path —
    # presumably matches the flask deployment; verify against the image.
    with open('/opt/services/flaskapp/src/training_log.txt') as flog:
        log_lines = flog.readlines()
    print(label_file)
    return '{}{}\n{}'.format('Model trained and deployed to Elasticsearch: \n',
                             ''.join(log_lines[-5:-3]),
                             'Now test the model')
# NOTE(review): this chunk begins inside a feature-query generator whose
# def, loop, and try header are outside this view; the indentation below
# is a best-effort reconstruction of that enclosing structure.
            parsedJson = formatFeature(ftrId, keywords)
            # Each N.json.jinja template must render to a complete ES
            # query object rooted at "query".
            if not 'query' in parsedJson:
                raise ValueError(
                    "%s.json.jinja should be an ES query with root of {\"query..." % ftrId)
            thisBase['query']['bool']['should'] = parsedJson['query']
            yield thisBase
            ftrId += 1
    # Running out of N.json.jinja template files ends the iteration.
    except IOError:
        pass


def buildFeaturesJudgmentsFile(judgmentsWithFeatures, filename):
    # Write every judgment (now carrying logged feature values) to
    # `filename` in ranklib's training-file format, one row per judgment.
    with open(filename, 'w') as judgmentFile:
        for qid, judgmentList in judgmentsWithFeatures.items():
            for judgment in judgmentList:
                judgmentFile.write(judgment.toRanklibFormat() + "\n")


if __name__ == "__main__":
    from elasticsearch import Elasticsearch
    from judgments import judgmentsFromFile, judgmentsByQid

    esUrl = "http://localhost:9200"
    es = Elasticsearch()
    # Judgments grouped by query id
    judgements = judgmentsByQid(
        judgmentsFromFile(filename='sample_judgements.txt'))
    # Attach per-document feature values pulled from the tmdb index
    kwDocFeatures(es, index='tmdb', searchType='movie', judgements=judgements)
    for qid, judgmentList in judgements.items():
        for judgment in judgmentList:
            print(judgment.toRanklibFormat())
from sys import argv
from judgments import judgmentsFromFile, judgmentsByQid, duplicateJudgmentsByWeight

# Read the Elasticsearch host from settings.cfg; a command-line argument
# overrides the configured value.
config = configparser.ConfigParser()
config.read('settings.cfg')
esUrl = config['DEFAULT']['ESHost']
if len(argv) > 1:
    esUrl = argv[1]
es = Elasticsearch(esUrl, timeout=1000)

# Load features into Elasticsearch
initDefaultStore(esUrl)
loadFeatures(esUrl)

# Parse the human judgments, grouped by query id; duplicating by weight
# makes heavier judgments count proportionally more in training.
movieJudgments = judgmentsByQid(
    judgmentsFromFile(filename=HUMAN_JUDGMENTS))
movieJudgments = duplicateJudgmentsByWeight(movieJudgments)
# testProportion=0.0: every judgment lands in the training split
trainJudgments, testJudgments = partitionJudgments(movieJudgments, testProportion=0.0)

# Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
# output as "sample_judgments_wfeatures.txt"
logFeatures(es, judgmentsByQid=movieJudgments)
buildFeaturesJudgmentsFile(trainJudgments, filename=TRAIN_JUDGMENTS)
buildFeaturesJudgmentsFile(testJudgments, filename=TEST_JUDGMENTS)

# Train each ranklib model type
# NOTE(review): the loop body continues beyond this chunk
for modelType in [8, 9, 6]:
    # 0, MART
    # 1, RankNet
# NOTE(review): this chunk begins inside partitionJudgments; the def and
# the per-qid loop header are outside this view.
        # Randomly route each query's judgments into the test or train split
        draw = random()
        if draw <= testProportion:
            testJudgments[qid] = judgment
        else:
            trainJudgments[qid] = judgment
    return (trainJudgments, testJudgments)


if __name__ == "__main__":
    from elasticsearch import Elasticsearch
    from judgments import judgmentsFromFile, judgmentsByQid, duplicateJudgmentsByWeight

    esUrl = "http://ec2-54-234-184-186.compute-1.amazonaws.com:9616/supersecretsquirrel/"
    es = Elasticsearch(esUrl, timeout=1000)

    # Parse the judgments, grouped by query id; duplicate by weight so
    # heavier judgments count more in training.
    judgments = judgmentsByQid(judgmentsFromFile(filename='osc_judgments.txt'))
    judgments = duplicateJudgmentsByWeight(judgments)
    # testProportion=0.00: every judgment lands in the training split
    trainJudgments, testJudgments = partitionJudgments(judgments, testProportion=0.00)

    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "osc_judgments_wfeatures.txt"
    kwDocFeatures(es, index='o19s', searchType='post', judgements=judgments)
    numFeatures = len(judgments[1][0].features)
    print("Training on %s features" % numFeatures)
    buildFeaturesJudgmentsFile(trainJudgments, filename='osc_judgments_wfeatures_train.txt')
    buildFeaturesJudgmentsFile(testJudgments, filename='osc_judgments_wfeatures_test.txt')

    # Train each ranklib model type
    # NOTE(review): the loop body continues beyond this chunk
    for modelType in [0, 6, 9]:
        # 0, MART
# NOTE(review): this chunk begins inside a function issuing an HTTP
# request; the call being continued and its enclosing def are outside
# this view.
                         auth=ES_AUTH)
    print(resp.status_code)
    # Surface the error body on any non-2xx response
    if (resp.status_code >= 300):
        print(resp.text)


if __name__ == "__main__":
    import configparser
    from judgments import judgmentsFromFile, judgmentsByQid

    es = Elasticsearch(timeout=1000)

    # Load features into Elasticsearch
    initDefaultStore()
    loadFeatures()

    # Parse the judgments, grouped by query id
    movieJudgments = judgmentsByQid(
        judgmentsFromFile(filename='search_sample_judgments.txt'))

    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "sample_judgments_wfeatures.txt"
    logFeatures(es, judgmentsByQid=movieJudgments)
    buildFeaturesJudgmentsFile(
        movieJudgments, filename='search_sample_judgments_wfeatures.txt')

    # Train each ranklib model type
    # NOTE(review): the loop body continues beyond this chunk
    for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
        # 0, MART
        # 1, RankNet
        # 2, RankBoost
        # 3, AdaRank
        # 4, coord Ascent
        # 6, LambdaMART
        # 7, ListNET
        # 8, Random Forests
# NOTE(review): this chunk begins inside logFeatures' search-results loop;
# the enclosing def and the `for doc in ...` header are outside this view.
        docId = doc['_id']
        # Feature values logged by the ES LTR plugin for this document
        features = doc['fields']['_ltrlog'][0]['main']
        featuresPerDoc[docId] = featureDictToList(features)

    # Append features from ES back to ranklib judgment list
    for judgment in judgments:
        try:
            features = featuresPerDoc[judgment.docId]  # If KeyError, then we have a judgment but no movie in index
            judgment.features = features
        except KeyError:
            print("Missing movie %s" % judgment.docId)


def buildFeaturesJudgmentsFile(judgmentsWithFeatures, filename):
    # Write every judgment with its logged features to `filename` in
    # ranklib's training-file format, one row per judgment.
    with open(filename, 'w') as judgmentFile:
        for qid, judgmentList in judgmentsWithFeatures.items():
            for judgment in judgmentList:
                judgmentFile.write(judgment.toRanklibFormat() + "\n")


if __name__ == "__main__":
    from judgments import judgmentsFromFile, judgmentsByQid
    from elasticsearch import Elasticsearch

    es = Elasticsearch()
    # NOTE(review): rebinding the imported judgmentsByQid function to its
    # own result shadows it; it is not called again afterwards, so this
    # works, but a different local name would be clearer.
    judgmentsByQid = judgmentsByQid(judgmentsFromFile('sample_judgments.txt'))
    logFeatures(es, judgmentsByQid)
    buildFeaturesJudgmentsFile(judgmentsByQid, "sample_judgments_wfeatures.txt")
# NOTE(review): this chunk begins at the tail of a function that issued an
# HTTP request; its def is outside this view.
    print(resp.text)


if __name__ == "__main__":
    import configparser
    from judgments import judgmentsFromFile, judgmentsByQid

    es = Elasticsearch(timeout=1000)

    # Load features into Elasticsearch
    initDefaultStore()
    loadFeatures()

    # Parse the judgments, grouped by query id
    movieJudgments = judgmentsByQid(judgmentsFromFile(filename='sample_judgments.txt'))

    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "sample_judgments_wfeatures.txt"
    logFeatures(es, judgmentsByQid=movieJudgments)
    buildFeaturesJudgmentsFile(movieJudgments, filename='sample_judgments_wfeatures.txt')

    # Train each ranklib model type
    # NOTE(review): the loop body continues beyond this chunk
    for modelType in [0,1,2,3,4,5,6,7,8,9]:
        # 0, MART
        # 1, RankNet
        # 2, RankBoost
        # 3, AdaRank
        # 4, coord Ascent
        # 6, LambdaMART
        # 7, ListNET
        # 8, Random Forests
        # 9, Linear Regression
# NOTE(review): this chunk begins inside logFeatures; its def and the
# search response `res` come from outside this view.
    featuresPerDoc = {}
    for doc in res['hits']['hits']:
        docId = doc['_id']
        # Feature values logged by the ES LTR plugin for this document
        features = doc['fields']['_ltrlog'][0]['main']
        featuresPerDoc[docId] = featureDictToList(features)

    # Append features from ES back to ranklib judgment list
    for judgment in judgments:
        try:
            features = featuresPerDoc[judgment.docId]  # If KeyError, then we have a judgment but no movie in index
            judgment.features = features
        except KeyError:
            print("Missing movie %s" % judgment.docId)


def buildFeaturesJudgmentsFile(judgmentsWithFeatures, filename):
    # Write every judgment with its logged features to `filename` in
    # ranklib's training-file format, one row per judgment.
    with open(filename, 'w') as judgmentFile:
        for qid, judgmentList in judgmentsWithFeatures.items():
            for judgment in judgmentList:
                judgmentFile.write(judgment.toRanklibFormat() + "\n")


if __name__ == "__main__":
    from judgments import judgmentsFromFile, judgmentsByQid
    from elasticsearch import Elasticsearch

    es = Elasticsearch()
    # NOTE(review): rebinding the imported judgmentsByQid function to its
    # own result shadows it; it is not called again afterwards, so this
    # works, but a different local name would be clearer.
    judgmentsByQid = judgmentsByQid(judgmentsFromFile('sample_judgments.txt'))
    logFeatures(es, judgmentsByQid)
    buildFeaturesJudgmentsFile(judgmentsByQid, "sample_judgments_wfeatures.txt")
if __name__ == "__main__":
    import configparser
    from judgments import judgmentsFromFile, judgmentsByQid

    # Input judgments, output training file, and the ES featureset to use
    judgment_filename = 'rolling500_judgments.txt'
    # judgment_filename = 'implicit_judgements.txt'
    judgment_features_filename = 'rolling500_judgments_wfeatures.txt'
    featureset_name = 'rolling_features_1'

    es = Elasticsearch(timeout=1000)

    # Load features into Elasticsearch
    initDefaultStore()
    loadFeatures(featureset_name)

    # Parse the judgments, grouped by query id
    rollingJudgments = judgmentsByQid(
        judgmentsFromFile(filename=judgment_filename))

    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "sample_judgments_wfeatures.txt"
    logFeatures(es, judgmentsByQid=rollingJudgments)
    buildFeaturesJudgmentsFile(rollingJudgments,
                               filename=judgment_features_filename)

    # Train each ranklib model type
    # NOTE(review): the loop body continues beyond this chunk
    for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
        # 0, MART
        # 1, RankNet
        # 2, RankBoost
        # 3, AdaRank
        # 4, coord Ascent
        # 6, LambdaMART
        # 7, ListNET
        # 8, Random Forests
print("REBUILDING TRAINING DATA for %s (%s/%s)" % (judgments[0].keywords, idx, len(judgmentsByQid))) # Append features from ES back to ranklib judgment list for judgment in judgments: try: features = featuresPerDoc[ judgment. docId] # If KeyError, then we have a judgment but no movie in index judgment.features = features except KeyError: print("Missing movie %s" % judgment.docId) idx += 1 def buildFeaturesJudgmentsFile(judgmentsWithFeatures, filename): with open(filename, 'w') as judgmentFile: for qid, judgmentList in judgmentsWithFeatures.items(): for judgment in judgmentList: judgmentFile.write(judgment.toRanklibFormat() + "\n") if __name__ == "__main__": from judgments import judgmentsFromFile, judgmentsByQid solrColl = SolrColl('http://localhost:8983/solr/tmdb/') judgmentsByQid = judgmentsByQid(judgmentsFromFile('movie_judgments.txt')) logFeatures(solrColl, judgmentsByQid) buildFeaturesJudgmentsFile(judgmentsByQid, "sample_judgments_wfeatures.txt")