Ejemplo n.º 1
0
Archivo: predict.py Proyecto: wacax/AI2
def prediction(fileName=None):
    """Score a multiple-choice question file with a pre-trained word2vec model.

    Reads a tab-separated question file (columns: question, answerA..answerD,
    and optionally correctAnswer), embeds questions and answers with a binary
    word2vec model ("word2Vec.model.bin"), and picks the best answer per
    question via vectorAveraging.

    fileName -- name of the TSV to score; if None, taken from sys.argv[1].

    Relies on module-level names defined elsewhere in the project:
    dataDirectory, submissionDirectory, pd, np, os, sys, get_stop_words,
    stopwords, Word2Vec, questions2NumericArray, vectorAveraging,
    characterList2Numvec.

    Returns the raw results array produced by vectorAveraging (shape depends
    on whether the input file carries the correctAnswer column).
    """
    #fileName = "training_set.tsv"
    #fileName = "validation_set.tsv"
    if fileName is None:
        # Fall back to the first command-line argument; basename strips any
        # directory component so the file is always looked up in dataDirectory.
        fileName = os.path.basename(sys.argv[1])

    #Read Data
    # Question file is tab-separated; the sample submission is comma-separated.
    # NOTE: the sample submission is loaded unconditionally but only used in
    # the test-set branch below.
    predictionFile = pd.read_table(dataDirectory + fileName, sep='\t')
    sampleSubmission = pd.read_table(dataDirectory + "sample_submission.csv", sep=',')

    #Files compartmentalized
    questions = predictionFile["question"]

    answersA = predictionFile["answerA"]
    answersB = predictionFile["answerB"]
    answersC = predictionFile["answerC"]
    answersD = predictionFile["answerD"]

    # 7 columns means the file includes a "correctAnswer" column
    # (training/validation set); 6 columns means an unlabeled test set.
    if predictionFile.shape[1] == 7:
        #Validation answers
        correctAnswers = predictionFile["correctAnswer"]
        # NOTE(review): this comprehension builds one full "answerX" COLUMN per
        # correct letter, then [0] keeps only the first of those columns. So
        # correctAnswersSentences is the entire answer column named by the
        # FIRST question's correct letter — not per-row correct-answer texts.
        # This looks like a bug; confirm intent against vectorAveraging.
        correctAnswersSentences = [predictionFile["answer" + str(answer)] for answer in correctAnswers][0]

    # Union of two stop-word sources plus the empty string; set() removes
    # duplicates before converting back to a list.
    stopWords = list(set(get_stop_words('english') + stopwords.words('english') + [""]))
    #Remove necessary stopwords
    # "itself" and "all" are kept because they carry meaning in these
    # questions (presumably; tuned empirically — verify).
    necesaryStopwords = ["itself", "all"]
    for nStop in necesaryStopwords:
        stopWords.remove(nStop)

    # Whoosh search
#    whooshIndex = open_dir("index2")
#    searchResults = questionsWhooshSearch(ix=whooshIndex, questions=questions, answersA=answersA,
#                                          answersB=answersB, answersC=answersC, answersD=answersD,
#                                          stopWordsList=stopWords)

    ##Whoosh Query
    #ix = open_dir("index2")
    #with ix.searcher() as searcher:
    #    query = QueryParser("content", ix.schema).parse(u'drinking water ver good nice')
    #    results = searcher.search(query)
    #    print results[0]

    #Load picked files corpus
    #file = open("trainingQuestionsNumeric.pkl", 'r')
    #questions = pickle.load(file)
    #file.close()

    #Load the word2vec model
    # Binary-format model produced elsewhere in the project.
    wikiWord2Vec = Word2Vec.load_word2vec_format("word2Vec.model.bin", binary=True)
    print "word2Vec model loaded"

    #Corpus vocabulary Convert it to a set, for speed. word2vec
    # Set membership is O(1) vs O(n) on the index2word list.
    index2word_set = set(wikiWord2Vec.index2word)

    # Convert question texts to the numeric representation the scorer expects.
    trainingQuestionsNumeric = questions2NumericArray(questionsVector=questions, embeddingModel=wikiWord2Vec,
                                                      modelVocab=index2word_set, stopWords4Fun=stopWords)

    if predictionFile.shape[1] == 7:
        #Dot product of the word vectors taken from (https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-3-more-fun-with-word-vectors)
        # Index2word is a list that contains the names of the words in the model's vocabulary. Convert it to a set, for speed
        # Labeled branch: pass the correct letters/sentences so vectorAveraging
        # can also report correctness.
        results = vectorAveraging(word2VecModel=wikiWord2Vec, questionsArray=trainingQuestionsNumeric, answersA=answersA,
                                  answersB=answersB, answersC=answersC, answersD=answersD,
                                  modelsWordSet=index2word_set, correctLetter=correctAnswers,
                                  correctAnswersSentences=correctAnswersSentences, transformationFun=characterList2Numvec,
                                  stopWordsList=stopWords)
        #Accuracy on training set
        # Column 5 of results is presumably a 0/1 correctness flag — mean of
        # that column is the accuracy. TODO confirm against vectorAveraging.
        print "accuracy on training set is " + str(float(sum(results[:, 5])) / float(trainingQuestionsNumeric.shape[0]))

        #Save .csv for meta-analysis
        np.savetxt("validationArray.csv", results, delimiter=",")

    else:
        #Dot product of the word vectors taken from (https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-3-more-fun-with-word-vectors)
        # Index2word is a list that contains the names of the words in the model's vocabulary. Convert it to a set, for speed
        # Unlabeled branch: score only, no correctness information available.
        results = vectorAveraging(word2VecModel=wikiWord2Vec, questionsArray=trainingQuestionsNumeric, answersA=answersA,
                                  answersB=answersB, answersC=answersC, answersD=answersD,
                                  modelsWordSet=index2word_set, stopWordsList=stopWords, transformationFun=characterList2Numvec)

        #Remove NaN from arrays
        # NaN scores (e.g. all-out-of-vocabulary answers) become 0 so argmax
        # below never chooses them over a real score.
        valResults = np.nan_to_num(results)

        predictedAnswerVal = []
        possibleAnswers = ["A", "B", "C", "D"]

        for idx in range(len(valResults)):
            #idx = 0
            #questionIdx = 2
            # Boolean mask marking the maximal score(s) for this question;
            # index(True) picks the FIRST maximum, i.e. ties resolve to the
            # earlier letter.
            questionIdx = list(valResults[idx] == max(valResults[idx]))
            predictedAnswerVal.append(possibleAnswers[questionIdx.index(True)])

        #Write a submission file
        sampleSubmission["correctAnswer"] = predictedAnswerVal
        sampleSubmission.to_csv(submissionDirectory + "test.csv", sep=",", index=False)

    #Return results for further analysis
    return results
Ejemplo n.º 2
0
Archivo: AI2Main.py Proyecto: wacax/AI2
# Tail of a commented-out Doc2Vec training loop (loop header is above this
# fragment); kept for reference, the Doc2Vec path is currently disabled.
#    wikiDoc2Vec.train(wikipedia8thGradeTaggedDocument)
#    wikiDoc2Vec.alpha -= 0.002  # decrease the learning rate
#    wikiDoc2Vec.min_alpha = wikiDoc2Vec.alpha  # fix the learning rate, no decay
#    print "epoch number " + str(epoch + 1) + " finished"
#print "training DONE!"

#Corpus vocabulary Convert it to a set, for speed. word2vec
# Set membership is O(1) vs O(n) on the index2word list.
index2word_set = set(wikiWord2Vec.index2word)
#Corpus vocabulary Convert it to a set, for speed, Doc2Vec
#index2Doc2VecWord_set = set(wikiDoc2Vec.index2word)

#Dot product of the word vectors taken from (https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-3-more-fun-with-word-vectors)
# Index2word is a list that contains the names of the words in the model's vocabulary. Convert it to a set, for speed
# Score the labeled training questions with the word2vec model.
# NOTE(review): this call passes questions= while predict.py calls the same
# function with questionsArray= — one of the two keywords is likely stale;
# verify against vectorAveraging's signature.
trainingResults = vectorAveraging(word2VecModel=wikiWord2Vec, questions=trainQuestions, answersA=answersA,
                                  answersB=answersB, answersC=answersC, answersD=answersD,
                                  modelsWordSet=index2word_set, correctLetter=correctAnswers,
                                  correctAnswersSentences=correctAnswersSentences, transformationFun=characterList2Numvec,
                                  stopWordsList=stopWords)

#Accuracy on training set
# Column 5 of trainingResults is presumably a 0/1 correctness flag — its mean
# over all questions is the training accuracy. TODO confirm.
print "accuracy on training set is " + str(float(sum(trainingResults[:, 5])) / float(len(trainQuestions)))

#Doc2Vec native distance
# Disabled alternative scoring path using the Doc2Vec model instead of
# word2vec (note the different transformationFun).
# Index2word is a list that contains the names of the words in the model's vocabulary. Convert it to a set, for speed
#trainingResultsDoc2Vec = vectorAveraging(word2VecModel=wikiDoc2Vec, questions=trainQuestions, answersA=answersA,
#                                         answersB=answersB, answersC=answersC, answersD=answersD,
#                                         modelsWordSet=index2Doc2VecWord_set, correctLetter=correctAnswers,
#                                         correctAnswersSentences=correctAnswersSentences, transformationFun=list2List,
#                                         stopWordsList=stopWords)

#Accuracy on training set