Example 1
import json
from sklearn.neural_network import MLPClassifier
# tokanize, clean, getVocabulary and getBOW come from the project's
# process module (see Example 4); readTrainingData and readTestingData
# are assumed to be project helpers as well.
from process import tokanize, clean, getVocabulary, getBOW

def main():
    # read the data:
    print("reading data...")
    trainingData, trainingLabels = readTrainingData()
    testingData, testingLabels = readTestingData()
    print("done")

    # tokenize:
    print("tokenizing...")
    trainingTokens = tokanize(trainingData)
    testingTokens = tokanize(testingData)
    #print(trainingTokens[:5])
    print("done")

    # clean data
    print("cleaning the data...")
    trainingTokens = clean(trainingTokens)
    testingTokens = clean(testingTokens)
    print("done")

    # write vocabulary
    print("creating vocabulary...")
    vocab = getVocabulary(trainingTokens)
    with open("../data/vocab.json", "w") as vocab_file:
        json.dump(vocab, vocab_file)
    print(len(vocab))
    
    
    # read vocabulary (alternative to recreating it):
    # print("load vocabulary...")
    # with open("../data/vocab.json") as vocab_file:
    #     vocab = json.load(vocab_file)
    # print(len(vocab))
    # print("done!")

    # create vector
    print("creating bag of words")
    words_vector = getBOW(trainingTokens, vocab)
    words_vector_test = getBOW(testingTokens, vocab)
    #print(words_vector[2])
    print("done")

    # create model

    # logistic regression:
    # clf = LogisticRegression(verbose=True, random_state=0)

    # decision tree:
    # clf = tree.DecisionTreeClassifier()

    # Multi-layer Perceptron:
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(112, 10), random_state=1)

    #print("training the model...")
    clf.fit(words_vector, trainingLabels)
    #print("done")
    score = clf.score(words_vector_test, testingLabels)
    print("score: "+ str(score))
Example 2
import argparse
import pprint
import sys
from itertools import combinations

import numpy as np

# reader and process are the project's own modules
import reader
import process


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('letters', metavar='N', type=str, nargs='+', help='gather strings')
    args = parser.parse_args()

    question = np.array(args.letters)
    question_range = range(3, question.shape[0]+1)
    # words = reader.read('../english-words/words_dictionary.json')
    words = reader.request('https://raw.githubusercontent.com/dwyl/english-words/master/words_dictionary.json')
    if not words:
        print('Cannot Find Words')
        sys.exit(1)
    answers = []

    for i in question_range:
        
        combination = combinations(question, i)

        for combi in combination:
            clean = process.clean(combi)
            equivalents = process.words(words, clean)
            for equivalent in equivalents:
                answers.append(equivalent['words'])
    
    pprint.pprint(answers)

    return
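reader.request evidently fetches the dwyl words_dictionary.json as a dict of words, while process.clean and process.words are not shown. Judging from the call sites, clean likely normalizes a letter combination into a canonical key and words returns the dictionary entries matching it. A sketch under those assumptions (only the call shapes come from the example; the bodies are guesses):

# hypothetical versions of the helpers used above; signatures inferred
# from the call sites, not taken from the real project

def clean(combination):
    # normalize a tuple of letters to one lowercase, sorted string so
    # that anagrams share a key, e.g. ('c', 'a', 't') -> 'act'
    return ''.join(sorted(c.lower() for c in combination))

def words(dictionary, key):
    # return every dictionary word whose sorted letters equal the key,
    # shaped as dicts so callers can read equivalent['words']
    return [{'words': w} for w in dictionary if ''.join(sorted(w)) == key]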
Example 3
import gensim
from gensim import corpora

def lda():
    # Model the topics in the documents using LDA.
    # doc_complete (the raw documents) and clean() are assumed to be
    # defined elsewhere in the project.
    doc_clean = [clean(doc).split() for doc in doc_complete]
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    Lda = gensim.models.ldamodel.LdaModel
    myTopicModel = Lda(doc_term_matrix, id2word=dictionary, passes=50)
    # Note: handed a whole corpus, get_document_topics returns one entry
    # per document, so this length is the document count; the model's
    # configured topic count is myTopicModel.num_topics.
    numberOfModelTopics = len(
        myTopicModel.get_document_topics(doc_term_matrix))
    return numberOfModelTopics
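Since doc_complete and clean() are only referenced, here is a minimal driver sketch, assuming doc_complete is a list of raw strings and clean() lowercases and strips punctuation; both are hypothetical stand-ins for definitions the real project keeps elsewhere:

import string

# hypothetical inputs for lda(); the real project defines these elsewhere
doc_complete = [
    "Topic models find latent themes in a collection of text.",
    "LDA represents each document as a mixture of topics.",
]

def clean(doc):
    # lowercase and strip punctuation (assumed cleaning rule)
    return doc.lower().translate(str.maketrans('', '', string.punctuation))

print(lda())  # with this corpus, prints one entry per document, i.e. 2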
Example 4
from process import tokanize, clean, getVocabulary, getBOW
import json
from joblib import load
# the saved model is a scikit-learn LogisticRegression; the import keeps
# that dependency explicit
from sklearn.linear_model import LogisticRegression

print("write your review:")
review = [input()]

token = tokanize(review)
token = clean(token)
#print(token)
with open("../data/vocab.json") as vocab_file:
    vocab = json.load(vocab_file)
vector = getBOW(token, vocab)
# load the pre-trained bag-of-words logistic regression model
clf = load("../models/BOW_LR_WEBSITE.joblib")
#print(vector)
prediction = clf.predict(vector)
print("your movie score is: " + str(prediction[0]))
Example 5
import process
import time

data = 'abdqwbeujqehujqkahwsdjkwqgherjqwabndjkgqukwjehnkqwbadxukjqwghjkkbjdklbqwuikbjklhgeujkdqwbnadsxjkbquwkgedujkqwbsdjkxhqwuikhbedjkgbqwujkasgxjkqwbeukdsgq'

iterations = 100000

# acquire a native handle from the process extension
ptr = process.setup(1234)

start_time = time.time()

# time repeated set/get round trips through the handle
for _ in range(iterations):
    process.set(ptr, data)
    value = process.get(ptr)

print(time.time() - start_time)  # elapsed seconds

# release the native handle
process.clean(1234, ptr)
Example 6
import time
import process

data = 'abdqwbeujqehujqkahwsdjkwqgherjqwabndjkgqukwjehnkqwbadxukjqwghjkkbjdklbqwuikbjklhgeujkdqwbnadsxjkbquwkgedujkqwbsdjkxhqwuikhbedjkgbqwujkasgxjkqwbeukdsgq'

iterations = 100000

start_time = time.time()

# same benchmark as Example 5, but the slot is addressed by a string key
# rather than a native handle
for _ in range(iterations):
    process.set("abcd", data)
    value = process.get("abcd")

print(time.time() - start_time)  # elapsed seconds

# release the slot
process.clean("abcd")
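The process module in Examples 5 and 6 is not shown and is presumably a native extension. Judging from the calls, it behaves like a small key/value store whose slots are addressed either by a handle returned from setup (Example 5) or by a caller-chosen string key (Example 6). A pure-Python stand-in with the same call shapes, inferred entirely from the call sites, lets both benchmarks run without the extension:

# process.py - hypothetical pure-Python stand-in; the setup/set/get/clean
# names deliberately mirror the calls in Examples 5 and 6

_store = {}

def setup(seed):
    # allocate a slot and return an opaque handle for it (Example 5 style)
    handle = object()
    _store[handle] = None
    return handle

def set(key, value):
    # store a value under a handle or a string key
    _store[key] = value

def get(key):
    # read the stored value back
    return _store[key]

def clean(*args):
    # release a slot; Example 5 calls clean(seed, handle), Example 6
    # calls clean(key), so take the last argument as the slot key
    _store.pop(args[-1], None)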