Example #1
def build_dataset(urls, name, query, method='skip_1', entities=None):
    '''Builds the data set for a list of game URLs and a given query.
    
    Only for testing.
    The try/except clauses are here because scraping sometimes fails due to
    the ESPN website; in that case the error is often a list index out of
    range, as most of the scraping methods break when they reach specific
    areas of the HTML source code.
    '''
    if method == 'word2vec':
        model = word2vec.Word2Vec.load_word2vec_format(WORD2VEC_PATH,
                                                       binary=True)
    entities = entities or {}
    # create Dataset object
    print 'Starting to build dataset {}.'.format(name)
    dataset = Dataset(name)
    for url in urls:
        # create game objects
        try:
            g = gme.Game(url)
        except:
            continue
        try:
            answer = g.query_dict[query]
        except KeyError:
            continue # question not in dataset (e.g. who scored the 1st goal)
        # get and anonymize text
        text = ' '.join(g.text)
        text, entities = txt.anonymize(text)
        inv_entities = {v: k for k, v in entities.items()}
        # create a feature vector and label for each entity in the text
        for ent_id in inv_entities:
            ent_name = 'ent' + str(ent_id)
            # label is 1.0 when the entity matches the answer, else 0.0
            try:
                label = (ent_id == inv_entities[answer]) * 1.0
            except KeyError:
                label = (inv_entities[ent_id] in answer) * 1.0
            if method != 'word2vec':
                feature_vector = ext.create_feature_vector(ent_name, text, method)
                # add feature vector to dataset
                dataset.append((feature_vector, label), ent_name)
            else:
                feature_vector = ext.create_feature_vector(ent_name, text,
                                                           method, model=model)
                # word2vec vectors are stored as {index: value} dicts
                dataset.append((dict(zip(range(len(feature_vector)),
                                         feature_vector)), label), ent_name)
    return dataset, entities
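
A minimal usage sketch for build_dataset; the URLs and the query key below are placeholders, not values taken from the project:

# Hypothetical call: build a training set from a handful of scraped games.
# '<espn-gamecast-url>' and '<query-key>' stand in for real inputs.
urls = ['<espn-gamecast-url-1>', '<espn-gamecast-url-2>']
train_set, train_entities = build_dataset(urls, 'train', '<query-key>',
                                          method='skip_1')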
Example #2
def build_dataset_from_path(path, name, query, method='skip_1', entities=None):
    '''Builds the data set from a pickle dump at the given path and a given query.

    Loops over the pickled games and their text entities.
    '''
    if method == 'word2vec':
        model = word2vec.Word2Vec.load_word2vec_format(WORD2VEC_PATH,
                                                       binary=True)
    entities = entities or {}
    # create Dataset object
    print 'Starting to build dataset {}.'.format(name)
    dataset = Dataset(name)
    f = open(path, 'rb')
    while True:
        # create game objects
        try:
            g = pickle.load(f)
            print 'Loaded game in training set.'
        except:
            break
        try:
            answer = g.query_dict[query]
        except KeyError:
            continue # question not in dataset (e.g. who scored the 1st goal)
        # get and anonymize text
        text = ' '.join([t.decode() for t in g.text])
        text, entities = txt.anonymize(text)
        inv_entities = {v: k for k, v in entities.items()}
        # create a feature vector and label for each entity in the text
        for ent_id in inv_entities:
            ent_name = 'ent' + str(ent_id)
            # label is 1.0 when the entity matches the answer, else 0.0
            try:
                label = (ent_id == inv_entities[answer]) * 1.0
            except KeyError:
                label = (inv_entities[ent_id] in answer) * 1.0
            if method != 'word2vec':
                feature_vector = ext.create_feature_vector(ent_name, text, method)
                # add feature vector to dataset
                dataset.append((feature_vector, label), ent_name)
            else:
                feature_vector = ext.create_feature_vector(ent_name, text,
                                                           method, model=model)
                # word2vec vectors are stored as {index: value} dicts
                dataset.append((dict(zip(range(len(feature_vector)),
                                         feature_vector)), label), ent_name)
    f.close()
    return dataset, entities
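
A sketch of how the pickle dump consumed above might be produced, reusing gme.Game from the first example; the file name and URL list are placeholders:

import pickle

# Hypothetical dump step: write Game objects back to back into a single file,
# which matches the repeated pickle.load calls in build_dataset_from_path.
with open('games.pkl', 'wb') as dump_file:        # placeholder file name
    for url in urls:                              # urls: placeholder list of game URLs
        pickle.dump(gme.Game(url), dump_file)

dataset, entities = build_dataset_from_path('games.pkl', 'train', '<query-key>')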
Example #3
def predict(name, query, testGame, model, method='skip_1'):
    '''Predicts the answer to the query and returns a list of (score, answers)
    tuples, as well as the correct answer.

    In each tuple, answers is the list of all acceptable forms of the answer,
    e.g. ['Ronaldo', 'Cristiano Ronaldo', 'Cristiano'].
    '''
    entities = {}
    # create Dataset object
    testSet = dts.Dataset.from_columns(name)    
    text = ' '.join([t.decode() for t in testGame.text])
    text, entities = txt.anonymize(text)
    inv_entities = {v: k for k, v in entities.items()}
    # fetch answer
    try:
        answer = testGame.query_dict[query]
    except KeyError:
        answer = 'N/A'
    # create a feature vector and label for each entity in the text
    for ent_id in inv_entities:
        ent_name = 'ent' + str(ent_id)
        # label is 1.0 when the entity matches the answer, else 0.0
        try:
            label = (ent_id == inv_entities[answer]) * 1.0
        except KeyError:
            label = (inv_entities[ent_id] in answer) * 1.0
        if method != 'word2vec':
            feature_vector = ext.create_feature_vector(ent_name, text, method)
            # add feature vector to dataset
            testSet.append((feature_vector, label), ent_name)
        else:
            feature_vector = ext.create_feature_vector(ent_name, text,
                                                       method, model=model)
            # word2vec vectors are stored as {index: value} dicts
            testSet.append((dict(zip(range(len(feature_vector)),
                                     feature_vector)), label), ent_name)
    scores = []
    words = testSet.entities
    _, _, probas = llb.predict(testSet.Y, testSet.X, model, '-b 1')
    for i, proba in enumerate(probas):
        scores.append((proba[1],
                       [k for k, v in entities.iteritems()
                        if str(v) == words[i][3:]]))
    return scores, answer
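
A short sketch of how the output of predict could be reduced to a single guess; testGame and the trained liblinear model are assumed to come from the surrounding project, and the query key is a placeholder:

# Hypothetical use of the returned scores: each entry is
# (probability, [surface forms of the entity]); keep the most probable one.
scores, answer = predict('test', '<query-key>', testGame, model)
best_prob, best_names = max(scores, key=lambda pair: pair[0])
hit = answer in best_names     # crude check; answer may be 'N/A' if the query was missing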