def build_dataset(urls, name, query, method='skip_1', entities=None): '''Builds the data set for a list of game URLs and a given query. Only for testing. The try/except clauses are here because sometimes the scraping does not work, because of the EPSN website. In this case, the error is often a list index out of range, as most of the scraping methods will break when they reach specific areas of the HTML source code. ''' if method=='word2vec': model = word2vec.Word2Vec.load_word2vec_format(WORD2VEC_PATH, binary=True) entities = entities or {} # create Dataset object print 'Starting to build dataset {}.'.format(name) dataset = Dataset(name) for url in urls: # create game objects try: g = gme.Game(url) except: continue try: answer = g.query_dict[query] except KeyError: continue # question not in dataset (e.g. who scored the 1st goal) # get and anonimyze text text = ' '.join(g.text) text, entities = txt.anonymize(text) #for i in range(len(text)): #text[i], entities = txt.anonymize(text[i], entities) inv_entities = {v: k for k, v in entities.items()} # fetch answer # create feature vector for each entity in text for ent_id in inv_entities.iterkeys(): ent_name = 'ent' + str(ent_id) if method!='word2vec': feature_vector = ext.create_feature_vector(ent_name, text, method) try: label = (ent_id == inv_entities[answer]) * 1.0 except KeyError: label = (inv_entities[ent_id] in answer) * 1.0 # add feature vector to dataset dataset.append((feature_vector, label), ent_name) else: feature_vector = ext.create_feature_vector(ent_name, text, method, model=model) try: label = (ent_id == inv_entities[answer]) * 1.0 except KeyError: label = (inv_entities[ent_id] in answer) * 1.0 dataset.append((dict(zip(range(len(feature_vector)), feature_vector)), label), ent_name) return dataset, entities
def build_dataset_from_path(path, name, query, method='skip_1', entities=None): '''Builds the data set for a path to pickle dump and a given query. Loops over the text entities. ''' if method=='word2vec': model = word2vec.Word2Vec.load_word2vec_format(WORD2VEC_PATH, binary=True) entities = entities or {} # create Dataset object print 'Starting to build dataset {}.'.format(name) dataset = Dataset(name) f = open(path, 'rb') while True: # create game objects try: g = pickle.load(f) print 'Loaded game in training set.' except: break try: answer = g.query_dict[query] except KeyError: continue # question not in dataset (e.g. who scored the 1st goal) # get and anonimyze text text = ' '.join([t.decode() for t in g.text]) text, entities = txt.anonymize(text) #for i in range(len(text)): #text[i], entities = txt.anonymize(text[i], entities) inv_entities = {v: k for k, v in entities.items()} # fetch answer answer = g.query_dict[query] # create feature vector for each entity in text for ent_id in inv_entities.iterkeys(): ent_name = 'ent' + str(ent_id) if method!='word2vec': feature_vector = ext.create_feature_vector(ent_name, text, method) try: label = (ent_id == inv_entities[answer]) * 1.0 except KeyError: label = (inv_entities[ent_id] in answer) * 1.0 # add feature vector to dataset dataset.append((feature_vector, label), ent_name) else: feature_vector = ext.create_feature_vector(ent_name, text, method, model=model) try: label = (ent_id == inv_entities[answer]) * 1.0 except KeyError: label = (inv_entities[ent_id] in answer) * 1.0 dataset.append((dict(zip(range(len(feature_vector)), feature_vector)), label), ent_name) f.close() return dataset, entities
def predict(name, query, testGame, model, method='skip_1'):
    '''Predicts the answer to the query for a single test game.

    Args:
        name: name given to the temporary test Dataset.
        query: key into the game's query_dict identifying the question.
        testGame: Game object providing the text and the correct answer.
        model: trained classifier passed to llb.predict.
            NOTE(review): when method == 'word2vec' this same object is
            also passed as the embedding model to create_feature_vector
            (pre-existing behavior) — confirm callers intend that.
        method: feature-extraction method; 'word2vec' switches to dense
            vectors.

    Returns:
        (scores, answer): `scores` is a list of tuples
        (probability, candidate_names), where candidate_names holds all
        surface forms mapped to that entity, e.g.
        ['Ronaldo', 'Cristiano Ronaldo', 'Cristiano']; `answer` is the
        correct answer string, or 'N/A' when the query is absent.
    '''
    entities = {}
    # build a one-game test Dataset
    testSet = dts.Dataset.from_columns(name)
    text = ' '.join([t.decode() for t in testGame.text])
    text, entities = txt.anonymize(text)
    inv_entities = {v: k for k, v in entities.items()}
    # fetch the ground-truth answer if the question exists for this game
    try:
        answer = testGame.query_dict[query]
    except KeyError:
        answer = 'N/A'
    # one feature vector (and one binary label) per entity in the text
    for ent_id in inv_entities.iterkeys():
        ent_name = 'ent' + str(ent_id)
        # label is 1.0 when this entity is the answer; when the answer
        # string is not itself a known entity, fall back to a substring
        # match against the answer text
        try:
            label = (ent_id == inv_entities[answer]) * 1.0
        except KeyError:
            label = (inv_entities[ent_id] in answer) * 1.0
        if method != 'word2vec':
            feature_vector = ext.create_feature_vector(ent_name, text, method)
            testSet.append((feature_vector, label), ent_name)
        else:
            feature_vector = ext.create_feature_vector(ent_name, text, method, model=model)
            # dense word2vec vector: re-index as {position: value}
            testSet.append((dict(zip(range(len(feature_vector)), feature_vector)), label),
                           ent_name)
    # score every candidate entity with the trained model
    words = testSet.entities
    _, _, probas = llb.predict(testSet.Y, testSet.X, model, '-b 1')
    scores = []
    for i, proba in enumerate(probas):
        # map 'entN' (strip the 'ent' prefix) back to every surface form
        # of entity N in the anonymization mapping
        candidates = [k for k, v in entities.iteritems() if str(v) == words[i][3:]]
        scores.append((proba[1], candidates))
    return scores, answer