Example #1
    x5 = ' '.join(x4)  
    train_2.append([x5, thingy[1]])

    
test_2 = []
for thingy in test_1:
    # same cleaning pipeline as the training loop above: strip URLs, tokenize,
    # drop punctuation, lower-case, apply check(), remove empty tokens, re-join
    no_url = re.sub(r'^https?:\/\/.*[\r\n]*', '', thingy[0], flags=re.MULTILINE)
    x0 = word_tokenize(no_url)
    x1 = [w.translate(table_punctuation) for w in x0]  
    x2 = [word.lower() for word in x1]
    x3 = [check(word) for word in x2]
    x4 = list(filter(None, x3))  
    x5 = ' '.join(x4)  
    test_2.append([x5, thingy[1]])

cl_custom = NaiveBayesClassifier(train_1)

cl_2 = NaiveBayesClassifier(train_2)

Doc_set_train_1 = []
for thingy in train_1[:10]:
    no_url = re.sub(r'^https?:\/\/.*[\r\n]*', '', thingy[0][0], flags=re.MULTILINE)
    x0 = word_tokenize(no_url)
    x1 = [w.translate(table_punctuation) for w in x0]  
    x2 = [word.lower() for word in x1]
    x3 = [check(word) for word in x2]
    x4 = list(filter(None, x3))  
    x5 = ' '.join(x4)  
    Doc_set_train_1.append([x5, thingy[0][1]])

cl_Doc_set = NaiveBayesClassifier(Doc_set_train_1)
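A brief usage sketch, not part of the original example: assuming the cl_2 classifier and the test_2 list built above, textblob can report accuracy over (text, label) pairs and classify individual documents.

# Hedged sketch: evaluate cl_2 on the preprocessed test set built above
print("cl_2 accuracy on test_2:", cl_2.accuracy(test_2))
print("first test document classified as:", cl_2.classify(test_2[0][0]))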
Example #2
trainPosts = posts[:trainSetCount]
trainSet = [
    (re.sub(re.compile('<.*?>'), '', post.get("Body")).replace("\n", ""),
     post.get("Tags").replace("<", "").split(">")[:-1])  # drop the trailing empty string after the last '>'
    for post in trainPosts if post.get("Tags") is not None
]

classifierInput = []
for post in trainSet:
    body = post[0]
    for tag in post[1]:
        classifierInput.append((body, tag))

#trains classifier
classifier = NaiveBayesClassifier(classifierInput)

#Build test set
testSet = posts[trainSetCount:trainSetCount + testPartCount]

#Perform classification
for post in testSet:
    classificationResults = classifier.prob_classify(
        re.sub(re.compile('<.*?>'), '', post.get("Body")).replace("\n", ""))
    print("\nPost: " + str(classificationResults))
    print("Best matching tags: ")
    tagsWithRanks = {}
    for tag in classificationResults.samples():
        tagsWithRanks[tag] = classificationResults.prob(tag)

    tagsWithRanks = sorted(tagsWithRanks.items(),
                           key=lambda item: item[1], reverse=True)
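    # Hedged continuation sketch (not in the original snippet): with tagsWithRanks
    # sorted as (tag, probability) pairs, print the five most probable tags.
    for tag, prob in tagsWithRanks[:5]:
        print("  %s (p=%.3f)" % (tag, prob))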
Example #3
df_xml = pd.DataFrame(xml_data, columns=dfcols)
# print(df_xml)


dfcolsa = ['answer_id', 'group', 'isElectedAnswer', 'text']
answers = root.findall('.//answer')
xml_data_ans = [[answer.get(dfcolsa[0]), answer.get(dfcolsa[1]), answer.get(dfcolsa[2]), answer.get(dfcolsa[3])] for answer in answers]
df_xml_ans = pd.DataFrame(xml_data_ans, columns=dfcolsa)
# print(df_xml_ans)

# textblob
train = [(getattr(row, 'title'), getattr(row, 'question_id')) for row in df_xml.itertuples()]
from textblob.classifiers import NaiveBayesClassifier
#cl = NaiveBayesClassifier(train)
smalltrain = train[0:100] # keep the training sample small: the full ~10k rows take roughly 7-10 minutes to train
cl = NaiveBayesClassifier(smalltrain)



# comparing with questions
nlpl = spacy.load('en_core_web_lg')
test = nlpl('How can i increase usb voltage?') # similar to #28
train1 = [(getattr(row, 'title'), row.Index) for row in df_xml.itertuples()]
smalltrain1 = train1[0:100]
results = []
for sentence in smalltrain1:
    doc0 = nlpl(sentence[0])
    results.append((test.similarity(doc0), sentence[1]))
# sorted(results, key=lambda res: res[0], reverse=True)
sorted(results, key=lambda res: res[0], reverse=True)[0:5] # top 5
Example #4
    for line in inputfile:
        neg_neutral.append(line.rstrip('\n'))
#pprint (neg_neutral)

ing_neutral = []
with open(
        'C:\\Users\\hisg316\\Desktop\\Htweetprod2\\Htweets2\\ing_neutral.txt',
        'r') as inputfile1:
    for line2 in inputfile1:
        ing_neutral.append(line2.rstrip('\n'))
#pprint (ing_neutral)

train = critical_train + critical_train2

#passing training data into the constructor
cl = NaiveBayesClassifier(critical_train)
cl2 = NaiveBayesClassifier(train)


# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):
    def __init__(self):
        self.tweet_data = []
        self.just_text = []
        # self.counter = 0

    def on_data(self, data):
        # pprint (data)
        # saveFile = io.open('tweet_raw.json', 'a', encoding='utf-8')
        # thetweets = json.loads(data)
        print(json.loads(data))
Example #5
def trainClassifier():
    generic_questions = (
        "Let's go",
        "You never wanted to go out with 'me, did you?",
        "Who knows?",
        "What annoys you?",
        "you've heard of him?",
        "What were you doing?",
        "Thank you anyway",
        "No problem",
        'She okay?',
        "Yes, I have a question.",
        "What is your question?",
        "What are your hobbies?",
        "You know how sometimes you just become this 'persona'?  And you don't know how to quit?",
        "what's up?",
        'sup people? I see the weather\'s getting better over there, Ben.',
        "how are you doing?",
        "Hi",
        "Hello",
        "Hey",
        "How's you?",
        "Have you heard the news?",
        'i had the same problem your having so thats my i made my own.',
        "What is your favorite book?",
        "good night",
        "good morning",
        "good afternoon",
        "good evening",
        "So what's your favorite color?",
        'What good stuff?',
        "what's new?",
        "How's life?",
        "That is good to hear",
        "I am doing well, how about you?",
        "I am doing well, how about you?",
        "I'm also good.",
        "What are you then?",
        'What are you working on?',
        "Who are you?",
        "What is it like?",
        "How do you work?",
        "Who is your appointment with?",
        "What languages do you like to use?",
    )

    technical_questions = (
        "Clearpass is extended to IT systems using which API?",
        "Which browsers are supported for ClearPass?",
        "Which  virtualization platforms  is supported by Clearpass?",
        "name the authentication/authorization sources used by clearpass.",
        "does Clearpass use ipv6 or ipv4 addressing?",
        "how many sessioons can  be provided by ClearPass C2000 Hardware Appliance?",
        "how does Admin/Operator access security?",
        'Virtual Appliances are supported on which platforms?',
        "Name the ClearPass Hardware Appliance Ports.",
        "What is the expansion of OCSP?",
        "what are the active Profiling Methods?", "What are cookies?",
        "what does dynamic authorisation mean?",
        "Which standard the clearpass Guest is built on?",
        "which protocol is used by the  NAS  to authenticate the user ?",
        "Which network connectivity is provisioned for Clearpass Guest?",
        "What is NAS?", "What are the possible states of a session?",
        "what does dynamic authorisation mean?",
        'Which standard the clearpass Guest is built on?',
        "Which network connectivity is provisioned for Clearpass Guest?",
        "What is the use of airgroup?", "What are cookies used for?",
        'Is Windows Server 2008 "Server Core" appropriate for a SQL Server instance?',
        "Is there any list of the network devices supported by clearpass for 802.1x auth",
        "How can I Block my users from installing new virtual machines",
        "Is there any list of medical devices compatible with clearpass ?",
        "what are Good branching and merging tutorials for TortoiseSVN?",
        "how to Add scripting functionality to .NET applications",
        "why is VMWare Server Under Linux Secondary NIC connection",
        "Setting up Continuous Integration with SVN",
        "Does CruiseControl.NET run on IIS 7.0?",
        "what to do  when there are users in both Edmonton and Toronto that access the same “Corpnet” Wireless LAN.",
        "what are the  three hardware appliance platforms that aruba provides?",
        "how to Powering Off the ClearPass Hardware Appliance?",
        "what are the Supported Hypervisors for clearpass?")

    generic_questions = [(x, 'generic') for x in generic_questions]
    technical_questions = [(x, 'tech') for x in technical_questions]

    training_set = []
    training_set.extend(generic_questions)
    training_set.extend(technical_questions)

    Qclassifier = NaiveBayesClassifier(training_set)
    save_classifier = open("naivebayes.pickle", "wb")
    pickle.dump(Qclassifier, save_classifier)
    save_classifier.close()
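A short usage sketch, not part of the original: trainClassifier() only writes naivebayes.pickle, so a later session would reload it before classifying.

# Hedged sketch: reload the pickled classifier and label a new question
import pickle

with open("naivebayes.pickle", "rb") as f:
    Qclassifier = pickle.load(f)
print(Qclassifier.classify("What are the supported hypervisors for ClearPass?"))  # likely 'tech'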
Example #6
print("It took " + str(time.time() - a) + " seconds to import data")
print('data imported')

# randomize the data to do a test against the corpus
random.seed(1)
random.shuffle(entire_data)

#train = entire_data[:30]
test = entire_data[1:10]  # hold out nine shuffled sentences for a quick accuracy check
print('training data')
a = time.time()

################ CLASSIFY ###################

cl = NaiveBayesClassifier(
    entire_data)  # create the classifier cl from the Trained data

###############  Accuracy ##################
print("It took " + str(time.time() - a) + " seconds to train data")
print('data trained, now checking accuracy:')

accuracy = cl.accuracy(test)  # Accuracy of the Classifier vs the Corpus
print("accuracy: " + str(accuracy))

stop = set(stopwords.words('english'))


def removestopword(textwords):
    # drop English stopwords and pure digits from a TextBlob's word list
    finaltext1 = []
    for word in textwords.words:
        if word not in stop and not word.isdigit():
            finaltext1.append(word)
    return finaltext1
Example #7
import pickle
import csv
from textblob.classifiers import NaiveBayesClassifier

with open("new_sample.tsv", encoding="utf-8", newline='') as f:
    #dialeto = csv.Sniffer().sniff(f.read(1024), delimiters="\t")
    dialeto = csv.Sniffer().sniff(
        f.read(4096), delimiters='\t')  # sample 4096 bytes for dialect detection
    f.seek(0)
    dado = csv.reader(f, dialeto)
    lista = []
    for l in dado:
        lista.append(tuple(l))
    cl = NaiveBayesClassifier(lista)
    g = open("trained.pickle", "wb")
    pickle.dump(cl, g)
    g.close()
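A short follow-up sketch, not part of the original snippet: the pickled classifier can be reloaded in a later session and applied to new text.

# Hedged sketch: reload trained.pickle and classify a sample sentence
import pickle

with open("trained.pickle", "rb") as g:
    cl = pickle.load(g)
print(cl.classify("sample sentence to label"))  # label set depends on the TSV training data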
Example #8
    pos_counts = Counter()

    pos_counts["n"] = len([item for item in probable_part_of_speech if item.pos() == "n"])
    pos_counts["v"] = len([item for item in probable_part_of_speech if item.pos() == "v"])
    pos_counts["a"] = len([item for item in probable_part_of_speech if item.pos() == "a"])
    pos_counts["r"] = len([item for item in probable_part_of_speech if item.pos() == "r"])

    most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
    return most_likely_part_of_speech


def lemmatize_words_and_sentence(word):
    tokenized_words = word_tokenize(word)
    return " ".join([lemmatizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized_words])


def add_to_training(word, category):
    word = word.lower()
    word = lemmatize_words_and_sentence(word)
    train.append((word, str(category)))
    check_list.append(word)


for keyword in keywords:
    add_to_training(keyword[0], keyword[1])

cl_NB = NaiveBayesClassifier(train)

joblib.dump(cl_NB, 'cl_NB.pkl')
Example #9
def clasify(data):
    cl = NaiveBayesClassifier(data)
    return cl
Example #10
def machinelearning():
    import random, time, nltk, csv, threading, warnings
    from textblob import TextBlob
    from nltk.corpus import stopwords
    from textblob.classifiers import NaiveBayesClassifier

    warnings.filterwarnings("ignore")

    # from wandb import magic
    # import wandb
    # wandb.init(magic=True)
    # wandb.init(project="uncategorized")
    # add file paths here
    file1 = "/home/blackfalcon/gitstuff/Detecting-Spoof-Emails-with-Information-Fusion/Dataset/SMSSpamCollection"
    file2 = "/home/blackfalcon/gitstuff/Detecting-Spoof-Emails-with-Information-Fusion/Dataset/SMSSpamCollection"

    # calculate the row count and the training amount we are going to use for
    # our classifier; the current dataset has around 6k rows of spam and ham (mixed)
    row_count = len(list(csv.reader(open(file1))))
    print(row_count)
    dothis = row_count - 1
    # using int to round the train amount (Lower BOUND)
    trainamount = int(row_count / 4)
    print(trainamount)
    # Since the training amount is 1/4 of the data set, increment by 1 so that
    # classification starts at the next row and runs to the end of the file
    therest = trainamount + 1
    print(therest)

    # bigchungas 55k unclassified
    big_count = len(list(csv.reader(open(file2))))
    big_counter = big_count - 1
    print(big_count)

    # filtering on stop words causes a massive INCREASE in import time, so we have to
    # pass a specific language to reduce the time taken, for example "english".
    # If left blank, NLTK's stopwords corpus covers around 24 languages, so at roughly
    # 15 seconds per language the check would take several minutes, not counting the
    # time spent sorting the words into tuples.
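    # Hedged aside (not part of the original snippet): stopwords.words("english")
    # returns a fresh list on every call, so the per-word membership tests in the
    # loops below would be cheaper with the list cached once as a set, e.g.:
    english_stopwords = set(stopwords.words("english"))
    # ...and then testing `word not in english_stopwords` inside the loops.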
    def get_list_tuples(read_file):
        list_tuples = []
        with open(read_file, "r", encoding="utf-8", errors="ignore") as r:
            c = 0
            for line in r:
                tabsep = line.strip().split("\t")
                msg = TextBlob(tabsep[1])
                try:
                    words = msg.words
                except:
                    continue
                for word in words:
                    if word not in stopwords.words("english") and not word.isdigit():
                        list_tuples.append((word.lower(), tabsep[0]))
                c += 1
                if c == row_count:
                    break
            return list_tuples

    # used for the super extreme case
    def get_list_spam(read_file):
        list_tuples = []
        with open(read_file, "r", encoding="utf-8", errors="ignore") as r:
            c = 0
            for line in r:
                tabsep = line.strip().split("\t")
                msg = TextBlob(tabsep[1])
                try:
                    words = msg.words
                except:
                    continue
                for word in words:
                    if word not in stopwords.words("english") and not word.isdigit():
                        list_tuples.append((word.lower(), tabsep[0]))
                c += 1
                # print(c)
                if c == big_counter:
                    break
            return list_tuples

    print("importing data...")
    a = time.time()
    entire_data = get_list_tuples(file1)
    unknown_data = get_list_spam(file2)

    print("It took " + str(time.time() - a) + " seconds to import data")
    print("data imported")
    print("shuffle the data")
    random.seed(1)
    random.shuffle(entire_data)
    random.shuffle(unknown_data)

    # train = entire_data[:row_count]
    # test = entire_data[:row_count]

    train = entire_data[:row_count]
    # train = unknown_data[1:2000]
    test = unknown_data[:big_count]
    print("training data")
    a = time.time()
    cl = NaiveBayesClassifier(train)
    # cl2 = MaxEntClassifier(train)
    # cl3 = DecisionTreeClassifier("call the police")
    # Timing and calculate accuracy
    print("It took " + str(time.time() - a) + " seconds to train data")
    print("data trained, now checking accuracy:")

    a = time.time()
    accuracy = cl.accuracy(test)
    # acc2 = cl2.accuracy(test)
    print("accuracy: " + str(accuracy))
    # print ("accuracy: "+str(acc2))
    print("It took " + str(time.time() - a) + "to calculate the accuracy")
    print(cl.classify("Oops, I'll let you know when my roommate's done"))  # ham
    print(
        cl.classify(
            "Get a brand new mobile phone by being an agent of The Mob! Plus loads more goodies! For more info just text MAT to 87021"
        )
    )  # spam
    print(
        cl.classify(
            "Doctors hate him, see how this man grew his dick upto six inches with this new method!"
        )
    )  # spam
    print(cl.classify("You just won $32432840928432 zimbabewewewewew dolla "))
    # from google.colab import output
    # output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')
    return cl
Example #11
            "zYLExHmQTGmshsADPuQ2pHJZxyd3p1La1XNjsnCr1Pialvj71e",
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "application/json"
        },
        params={"txt": string})
    global count
    print(count)
    count += 1
    return response.body['result']['sentiment']


couch = couchdb.Server()
db = couch['sandiegotweets']

train_data = [(db1[id]['text'].encode('ascii', 'ignore'),
               get_response(db1[id]['text'].encode('ascii', 'ignore')))
              for id in db]

joblib.dump(
    train_data, 'train_data.pkl'
)  #store the training data (including the results of Sentiment API)
#train_data = joblib.load('train_data.pkl')#load the training data (including the results of Sentiment API)

classifier = NaiveBayesClassifier(train_data)  #classify the training data

print(classifier.classify("Their burgers are amazing."))
print(classifier.classify("you look so great today."))
print(classifier.classify("this place is so beautiful"))

pickle.dump(classifier, output)  #store the model (for next time usage)
Example #12
def get_cl():
    train = build_train_data()
    cl = NaiveBayesClassifier(train)
    return cl
Example #13
        print(e)
        return

    modules = doc.xpath(
        "//h3[text()='Results']/following-sibling::ul//li[text()='tests']/ul/li"
    )
    csvfile = open('results.csv', 'wb')
    csvWriter = csv.writer(csvfile,
                           delimiter=",",
                           quotechar="|",
                           quoting=csv.QUOTE_MINIMAL)
    # Write the column names
    csvWriter.writerow(
        ["#", "Module", "TestCase", "Result", "FailureReason", "FailureType"])
    with open('train.csv', 'r') as trdata:
        cl = NaiveBayesClassifier(trdata, format="csv")
    tcCount = 0
    for module in modules:
        moduleName = module.xpath("./text()")[0]

        tc_lnks = module.xpath(".//li[contains(@class,'TestCaseImpl')]")
        for tc in tc_lnks:
            tchref = tc.xpath("./a/@href")[0]
            tcName = tc.xpath("./a/text()")[0]
            tcresulttext = "pass" if "Successful" in tc.xpath(
                "./@class")[0] else "fail"
            failure = ''
            failureType = ''
            if tcresulttext == "fail":
                tcdoc = html.parse(os.path.join(TEST_DIRECTORY, tchref))
                failure = getTestcasefailure(tcdoc)
Example #14
print("3.MaxEnt         3.NLTK\n")
choice = input("Select one classifier number: ")

# for testing with different dataset sizes
# size = input("n: ")
# trains = []
# for i in range(int(size)):
#     trains.append(train[i])
# for i in range(250, int(size)+250):
#     trains.append(train[i])

trains = train

if choice == "1":
    print("\n" + "#NaiveBayesClassifier")
    cl1 = NaiveBayesClassifier(trains)
    print("Classifier: Naive Bayes -- Accuracy: ", cl1.accuracy(test), "\n")

elif choice == "2":
    print("\n" + "#DecisionTreeClassifier")
    cl2 = DecisionTreeClassifier(trains)
    print("Classifier: Decision Tree -- Accuracy: ", cl2.accuracy(test), "\n")

elif choice == "3":
    print("\n" + "#MaxEntClassifier")
    cl3 = MaxEntClassifier(trains)
    print("Classifier: Maximum Entropy -- Accuracy: ", cl3.accuracy(test),
          "\n")

elif choice == "4":
    print("\n" + "#NLTKClassifier")
Example #15
def dollarSA():
    r = Rake()
    # Opens file and reads in training data
    # NB classifier trains using the read in data
    with open("../datasets/trainingData.csv", 'r') as trainingdata:
        classifier = NaiveBayesClassifier(trainingdata, format="csv")
        print("Training Data")
        classifier.show_informative_features(5)

    # Opens file and reads in testing data
    # Prints testing data accuracy
    # Not needed for final product

    with open("../datasets/testingData.csv", 'r') as testingdata:
        print("Testing data accuracy", classifier.accuracy(testingdata))

    with open("dollar.txt", 'r', encoding='utf-8') as a_file:
        for line in a_file:
            userInput = line.strip()

            regex = re.compile('[^a-zA-Z ]')
            punctuationRemoved = regex.sub('', userInput)

            # Defines stopwords
            stop_words = set(stopwords.words('english'))

            # Takes user input, removes stopwords
            word_tokens = word_tokenize(punctuationRemoved)

            # Keep only the words that survive stop-word removal
            filtered_sentence = [w for w in word_tokens if w not in stop_words]

            # Converts the filtered stop word sentence to string
            stringWithoutStopwords = ' '.join(
                [str(elem) for elem in filtered_sentence])

            # Extracts keywords from the filtered sentence
            r.extract_keywords_from_text(stringWithoutStopwords)

            # Ranks the keywords that have been extracted
            ranked_phrases = r.get_ranked_phrases()

            # Converts extracted keywords list to string
            listToStr = ' '.join([str(elem) for elem in ranked_phrases])

            # Runs string through trained NB classifier
            finalString = TextBlob(listToStr, classifier=classifier)

            # Print string followed by classification
            print(finalString + "," + finalString.classify())
Example #16
# combine title and review text
df['test'] = df['Title'] + ' ' + df['Review Text']

# split into 80% train 20% test
np.random.seed(0)
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]

subset_train = train[['test', 'Recommended IND']]
tuples_train = [tuple(x) for x in subset_train.values]
subset_test = test[['test', 'Recommended IND']]
tuples_test = [tuple(x) for x in subset_test.values]

cl = NaiveBayesClassifier(tuples_train)

# [x[0] for x in tuples_test]
# cl.classify(tuples_test[10][0])

# cl.classify("This is an amazing library!")

predicted_classifications = []


def make_predictions():
    for i in range(len(tuples_test)):
        classification = cl.classify(tuples_test[i][0])
        predicted_classifications.append(classification)
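
A follow-up sketch, not in the original: once make_predictions() has run, the predicted labels can be compared with the 'Recommended IND' ground truth kept in tuples_test.

# Hedged sketch: simple accuracy of the predictions gathered above
make_predictions()
actual_labels = [label for _, label in tuples_test]
n_correct = sum(p == a for p, a in zip(predicted_classifications, actual_labels))
print("accuracy:", n_correct / len(actual_labels))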

Example #17
working_on_utterances = ('what are you coding', 'what are you making',
                         'what are you doing',
                         'what are you doing at the moment',
                         'what are you working on', 'what you making')

experience_utterances = [(x, 'experience') for x in experience_utterances]
environment_utterances = [(x, 'enivornment') for x in environment_utterances]
working_on_utterances = [(x, 'working') for x in working_on_utterances]

# FIXME: find better way to flatten lists together
training_set = []
training_set.extend(experience_utterances)
training_set.extend(environment_utterances)
training_set.extend(working_on_utterances)

classifier = NaiveBayesClassifier(training_set)
classifier.show_informative_features()  # prints the feature table itself (returns None)
print(classifier.labels())

bogus_utterances = (
    'if you going to use nltk u may want to check this out spacy .io',
    'sup people? I see the weather\'s getting better over there, Ben.',
    'i had the same problem your having so thats my i made my own.',
    'try http, instead of https')

# TODO: Figure out how to make this stronger
dual_utterance = ('how long have you been coding and what IDE do you use', )

test_utterances = ('what are you making',
                   'hey that nyancat is cool, how do you get that?')

for t in test_utterances:
    print(t, '->', classifier.classify(t))
Example #18
def textrecogition():
    feelingtext_data = [
        ('I feel good.', 'pos'), ('It can not be better', 'pos'),
        ('the dinner is delicious', 'pos'), ('a nice day', 'pos'),
        ("yesterday is bad. but today everythings can not be better", 'pos'),
        ('today is bad', 'neg'), ('i feel rather horrible', 'neg'),
        ("i am so embarrassed", 'neg'), ('so bad a film!', 'neg'),
        ('/cy /cy.', 'pos'), ('passed away', 'pos'),
        ('I missed you so much', 'neg'), ('can not be worse', 'neg'),
        ('bad weather', 'neg'), ("I never want to do this again", 'neg'),
        ('I do not enjoy my job', 'neg'), ('i am so sad', 'neg'),
        ("i am too hot", 'neg'), ('I love this film', 'neg'),
        ('/cy /cy.', 'pos'), ('/ll /ll.', 'neg'), ('i hate exams', 'neg')
    ]
    cl = NaiveBayesClassifier(feelingtext_data)
    circleMessage = pd.read_csv('moment.txt', index_col=False, header=0)
    serie = circleMessage.transpose()

    if cl.classify(serie) == 'neg':
        predicted.words = -1
    else:
        predicted.words = 1

    if len(serie) == 0:
        predicted.words = 0

    shape = (1, 8, 8)
    blocks1 = np.lib.stride_tricks.as_strided(imread("1.jpg"), shape=shape)
    blocks2 = np.lib.stride_tricks.as_strided(imread("2.jpg"), shape=shape)
    blocks3 = np.lib.stride_tricks.as_strided(imread("3.jpg"), shape=shape)
    blocks4 = np.lib.stride_tricks.as_strided(imread("4.jpg"), shape=shape)
    blocks5 = np.lib.stride_tricks.as_strided(imread("5.jpg"), shape=shape)
    blocks6 = np.lib.stride_tricks.as_strided(imread("6.jpg"), shape=shape)
    blocks7 = np.lib.stride_tricks.as_strided(imread("7.jpg"), shape=shape)
    blocks8 = np.lib.stride_tricks.as_strided(imread("8.jpg"), shape=shape)
    blocks9 = np.lib.stride_tricks.as_strided(imread("9.jpg"), shape=shape)
    blocks10 = np.lib.stride_tricks.as_strided(imread("10.jpg"), shape=shape)
    blocks11 = np.lib.stride_tricks.as_strided(imread("11.jpg"), shape=shape)
    blocks12 = np.lib.stride_tricks.as_strided(imread("12.jpg"), shape=shape)
    blocks13 = np.lib.stride_tricks.as_strided(imread("13.jpg"), shape=shape)
    blocks14 = np.lib.stride_tricks.as_strided(imread("14.jpg"), shape=shape)
    blocks15 = np.lib.stride_tricks.as_strided(imread("15.jpg"), shape=shape)
    blocks16 = np.lib.stride_tricks.as_strided(imread("16.jpg"), shape=shape)
    blocks17 = np.lib.stride_tricks.as_strided(imread("17.jpg"), shape=shape)
    blocks18 = np.lib.stride_tricks.as_strided(imread("18.jpg"), shape=shape)
    blocks19 = np.lib.stride_tricks.as_strided(imread("19.jpg"), shape=shape)
    blocks20 = np.lib.stride_tricks.as_strided(imread("20.jpg"), shape=shape)
    blocks21 = np.lib.stride_tricks.as_strided(imread("21.jpg"), shape=shape)
    blocks22 = np.lib.stride_tricks.as_strided(imread("22.jpg"), shape=shape)
    blocks23 = np.lib.stride_tricks.as_strided(imread("23.jpg"), shape=shape)
    blocks24 = np.lib.stride_tricks.as_strided(imread("24.jpg"), shape=shape)
    blocks25 = np.lib.stride_tricks.as_strided(imread("25.jpg"), shape=shape)
    blocks26 = np.lib.stride_tricks.as_strided(imread("26.jpg"), shape=shape)

    a1 = np.vstack((blocks1, blocks2))
    a2 = np.vstack((blocks3, blocks4))
    a3 = np.vstack((blocks5, blocks6))
    a4 = np.vstack((blocks7, blocks8))
    a5 = np.vstack((blocks9, blocks10))
    a6 = np.vstack((blocks11, blocks12))
    a7 = np.vstack((blocks13, blocks14))
    a8 = np.vstack((blocks15, blocks16))
    a9 = np.vstack((blocks17, blocks18))
    a10 = np.vstack((blocks19, blocks20))
    a11 = np.vstack((blocks21, blocks22))
    a12 = np.vstack((blocks23, blocks24))
    a13 = np.vstack((blocks25, blocks26))
    a14 = np.vstack((a1, a2))
    a15 = np.vstack((a3, a4))
    a16 = np.vstack((a5, a6))
    a17 = np.vstack((a7, a8))
    a18 = np.vstack((a9, a10))
    a19 = np.vstack((a11, a12))

    a20 = np.vstack((a13, a14))
    a21 = np.vstack((a15, a16))
    a22 = np.vstack((a17, a18))
    a23 = np.vstack((a19, a20))
    a24 = np.vstack((a21, a22))
    a25 = np.vstack((a23, a24))
    shape = (26, 64)
    blocks = np.lib.stride_tricks.as_strided(a25, shape=shape)
    target = [
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 0
    ]
    imagesdataset = {
        'images':
        a25,
        'data':
        blocks,
        'target': [
            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
            1, 1, 1, 0
        ]
    }

    svc_1 = SVC(kernel='linear')
    clf = svc_1
    clf.fit(blocks, target)
    correctrate = clf.score(blocks, target)
    blocksdepress = np.lib.stride_tricks.as_strided(imread("depress.jpg"),
                                                    shape=(1, 64))
    imagerecog = clf.predict(blocksdepress)

    if imagerecog == [1]:
        print("the image should be positive")
        print("the images recognition accuracy is %f" % correctrate)
        predicted.colors = 1

    else:
        print("the image should be negative")
        print("the images recognition accuracy is %f" % correctrate)
        predicted.colors = 0
Example #19
texts = ["Ramiess sings classic songs",
         "he listens to old pop",
         "and rock music",
         "and also listens to classical songs"]
cv = CountVectorizer()
cv_fit = cv.fit_transform(texts)
print(cv.get_feature_names())
print(cv_fit.toarray())

# 1.4.7 TF-IDF scores
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
X = vect.fit_transform(texts)
print(X.todense())

# 1.4.8 Text classifier
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

data = [
    ("I love my country", 'pos'),
    ("This is an amazing place!", 'pos'),
    ("I do not like the smell of this place", 'neg'),
    ("I do not like this restaurant", 'neg'),
    ("I am tiredd of hearing your nonsense", 'neg'),
    ("I always aspire to be like him", "pos"),
    ("It's a horrible performance.", "neg")
]

model = NaiveBayesClassifier(data)
print(model.classify("It's an awesome place"))
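A small extension sketch, not in the original: prob_classify exposes the per-label probabilities behind the predicted class.

# Hedged sketch: inspect the probability distribution instead of only the label
prob_dist = model.prob_classify("It's an awesome place")
print(prob_dist.max())  # most likely label
print(round(prob_dist.prob("pos"), 3), round(prob_dist.prob("neg"), 3))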
Example #20
import textblob
from textblob.classifiers import NaiveBayesClassifier

with open('final.json', 'r') as fp:
    cl = NaiveBayesClassifier(fp, format="json")
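For reference, textblob's JSON loader expects an array of objects with "text" and "label" keys. Below is a minimal sketch that writes such a file; the filename and contents are illustrative only, not the real final.json.

# Hedged sketch: build a tiny training file in the expected JSON layout
import json

sample = [{"text": "I love this sandwich.", "label": "pos"},
          {"text": "I do not like this restaurant", "label": "neg"}]
with open('sample_train.json', 'w') as out:
    json.dump(sample, out)
# NaiveBayesClassifier(open('sample_train.json'), format="json") would train on this file.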
Example #21
from textblob.classifiers import NaiveBayesClassifier
from textblob import TextBlob


# ISSUE : training is case sensitive !!!

print "INFO : Model Initialization - BEGIN"

with open('training.csv', 'r') as fp:
     cl = NaiveBayesClassifier(fp, format="csv")

print "INFO : Model Initialization - END"

def classify(message):
  tags = cl.classify(message.lower())
 
  print "DEBUG : '",message,"'  - Result " , tags 
  return tags 
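
Following up on the case-sensitivity note above, a hedged sketch (assuming training.csv rows are plain text,label pairs) that lower-cases the training text before building the classifier, so it matches the classify(message.lower()) call:

import csv

with open('training.csv', 'r') as fp:
    lowered_rows = [(text.lower(), label) for text, label in csv.reader(fp)]
cl_lower = NaiveBayesClassifier(lowered_rows)  # hypothetical lower-cased variant of cl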

Example #22
    movieid = int(line.split(',')[0])
    tags = line.split(',')[1].replace('\n', '')
    if count == 1500:
        break
    if movieid in rating_dict.keys():
        traindata.append((tags, rating_dict[movieid]))
        #trainmovies.append(movieid)
        testdata.append(tags)
        testresult.append(rating_dict[movieid])
        #X.append(tags)
        #y.append(rating_dict[movieid])
        count += 1
print(count, ' number of movies available')
fr.close()

model = NaiveBayesClassifier(traindata)
#model.fit(traindata)
"""
#--------------------get testdata---------------------------------------
fr=open('tags_modified.csv')
line=fr.readline()
count=0
checkmovie=11
testdata=[]
testresult=[]
for line in fr:
    movieid=int(line.split(',')[0])
    tags=line.split(',')[1].replace('\n','')
    if int(movieid) in trainmovies:
    testdata.append(tags)
    testresult.append(rating_dict[movieid])    
Example #23
                   ('Entropy', 'Relationship Discovery'),
                   ('Probabilities', 'Relationship Discovery'),
                   ('paradigmatic', 'Relationship Discovery'),
                   ('collocations', 'Relationship Discovery'),
                   ('Topic Models',
                    'Topic Models,Clustering & Categorization'),
                   ('LDA', 'Topic Models,Clustering & Categorization'),
                   ('PLSA', 'Topic Models,Clustering & Categorization'),
                   ('Clustering', 'Topic Models,Clustering & Categorization'),
                   ('Latent', 'Opinion Mining & Sentiment Analysis'),
                   ('Opinion', 'Opinion Mining & Sentiment Analysis'),
                   ('Prediction', 'Contextual Text Mining'),
                   ('Contextual', 'Contextual Text Mining')]

#Instantiating the NB Classifier - Simple
classifier = NaiveBayesClassifier(featureListTrain)

#Random Shuffling of data for consistency
random.shuffle(data)

#print(str(data[0][1]).split('::'))

#Split Corpus data into train and test datasets
train, test = data[0:10], data[11:23]

#Update Classifier with new corpus data
classifier.update(train)

# Compute accuracy
accuracy = classifier.accuracy(featureListTest + test + data)
print("Accuracy: {0}".format(accuracy))
Example #24
def train_all(train_set):
    extract_all()
    final_train_set = combine_datasets(train_set)
    cl = NaiveBayesClassifier(final_train_set)
    pickle.dump(cl, open('model.pkl', 'wb'))
Example #25
"""
# stemming process
sentence = 'Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan'
output   = stemmer.stem(sentence)

print(output)
# ekonomi indonesia sedang dalam tumbuh yang bangga
"""

from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer releases

X_train, X_test = train_test_split(textblob, test_size=0.2, random_state=80)

from textblob.classifiers import NaiveBayesClassifier

clf = NaiveBayesClassifier(textblob)
clf.accuracy(X_test)

testing = 'Presiden rindujabarjuara etis rusuh'
clean_test = preprocess_tweet(testing)
prob_dist = clf.prob_classify(clean_test)

pos_prob = round(prob_dist.prob("1"), 2)
neg_prob = round(prob_dist.prob("0"), 2)

print(float(round(prob_dist.prob("0"), 2)))

round(prob_dist.prob("1"), 2)
round(prob_dist.prob("2"), 2)

clf.show_informative_features(15)
Example #26
# CSV files
data_folder = Path("C:/Users/adam.batchelor/Desktop/")
file_to_open = data_folder / "train5.csv"

comments = 'O:\\10. Workspace\DI Jiras\DI-670  Improve sentiment scoring of rating comments and deliver insights\Sushma\\csv comments_280720.csv'

# list for sentiment score
out_list = []
# list for translated comment
#trans_list = []

# Reading from training files
with open(file_to_open, 'r', encoding="utf8") as f:
    text = list(csv.reader(f))

cl = NaiveBayesClassifier(text)

# reading comments - insert new every run
df = pd.read_csv(comments)

# writing output to csv - with sentiment,score and translation
for index, row in df.iterrows():
    translation = translator.translate(row['review_comment'], dest="en")
    line = translation.text
    prob_dist = cl.prob_classify(TextBlob(line, classifier=cl))
    # print(line + str((prob_dist.prob("pos"))) + '\n')
    # trans_list.append(line) #purpose of this?

    out_list.append(round(5 * (prob_dist.prob("pos")), 1))

df["sentiment_score"] = out_list
Example #27
    i = raw_input("are you satisfied ? ")
    if i == "y":
        return True
    if i == 'n':
        return False
    else:
        print " y or n please"
        return _ask_about_result()  # module-level helper, so no self here


if __name__ == '__main__':
    print "Hello"

    data = load_sample()

    splitIndex = 2 * len(data) / 3
    train = data[:splitIndex]
    test = data[splitIndex:]

    cl = NaiveBayesClassifier(train)

    for item in test:
        print_item(item)

    print "accuarciy", cl.accuracy(test)

    happy = _ask_about_result()
    if happy:
        with open('classifier.pickle', "wb") as f:
            pickle.dump(cl, f)
Example #28
 def create_classifier(self, fname):
     with open(fname, 'r') as fp:
         cl = NaiveBayesClassifier(fp, format='csv')
     return cl
Example #29
from textblob.classifiers import NaiveBayesClassifier
trainData = []
f = open('TrainSet.txt', 'r')
data = f.readline().strip()
while data:
    splitData = data.split(',')
    category = splitData[0]
    content = splitData[1]
    entry = (content, category)  # avoid shadowing the built-in tuple
    trainData.append(entry)
    data = f.readline().strip()
classifier = NaiveBayesClassifier(trainData)
print "Training Done"
f.close()
f1 = open('TestSet.txt', 'r')
data = f1.read()
if classifier.classify(data) == 'C01':
    print "Bacterial Infections and Mycoses"
else:
    print "Virus Diseases"
Example #30
romney_tweets_raw = romney_tweets_raw.tolist()
obama_class_train = obama_class.tolist()
romney_class_train = romney_class.tolist()

romney_tweets = dataClean(romney_tweets_raw)  #romney tweets cleaning
obama_tweets = dataClean(obama_tweets_raw)  #obama tweets cleaning

obama_merged = zip(obama_tweets, obama_class_train)
obama_merged = list(obama_merged)

romney_merged = zip(romney_tweets, romney_class_train)
romney_merged = list(romney_merged)

# In[4]:

c1 = NaiveBayesClassifier(obama_merged)
c2 = NaiveBayesClassifier(romney_merged)

# In[5]:

testingFile = "test.xlsx"
df_obama_test = pd.read_excel(testingFile, sheet_name='Obama')
df_romney_test = pd.read_excel(testingFile, sheet_name='Romney')

#Removing the mixed class and the !!! class

df_obama_test = df_obama_test[(df_obama_test['Class'].isin((1, -1, 0)))]
df_romney_test = df_romney_test[(df_romney_test['Class'].isin((1, -1, 0)))]

#creating lists for raw tweets and classes