Example 1

import nltk
from nltk.tag import tuple2str


def makeTraining(lemma=False, stop=False, count=False, weight=False):
    print("\n\tMaking the corpus...")
    # Corpus is project-specific (not shown here): it loads the preprocessed
    # posts and exposes post_names, posts (each with .features, .feature_count
    # and .label) and sentweight.
    c = Corpus("corpus/preprocessed", lemma, stop, count, weight)

    featuresets = []
    for p in c.post_names:
        if not count and not weight:
            # Binary features: each "word/tag" feature is simply present.
            featuresets.append(({tuple2str(f): 1 for f in c.posts[p].features}, c.posts[p].label))
        elif not weight:
            # Count features: weight each feature by its frequency in the post.
            featuresets.append(({tuple2str(f): c.posts[p].feature_count[f[0]] for f in c.posts[p].features}, c.posts[p].label))
        else:
            # Weighted features: frequency scaled by the sentiment weight.
            featuresets.append(({tuple2str(f): c.posts[p].feature_count[f[0]] * c.sentweight[f] for f in c.posts[p].features}, c.posts[p].label))

    # 5-fold cross-validation over the 250 posts.
    avg = 0
    for i in range(5):
        # Reset the folds on every iteration; accumulating them across folds
        # would leak earlier test items into later training sets.
        test_set = []
        train_set = []
        for j in range(250):
            if j % 5 == i:
                test_set.append(featuresets[j])
            else:
                train_set.append(featuresets[j])
        classifier = nltk.classify.maxent.MaxentClassifier.train(train_set, count_cutoff=0, algorithm='GIS', max_iter=25)
        print("\n\tMost Informative Features\n\t-------------------------------------\n")
        classifier.show_most_informative_features(10)
        avg += nltk.classify.accuracy(classifier, test_set)
    print("\n\tAccuracy on Test Data           %.3f\n\t-------------------------------------\n" % (avg / 5))
    print("\n\n")
Example 2
    def process(self, msg):
        """
        POS-tags the text of an incoming message and sends back a MongoDB
        update request for the originating document.
        """
        try:
            # Pick the sentence tokenizer for the message language;
            # only English and Portuguese are supported.
            if msg['lang'] == 'en':
                sents = eng_sent_tokenizer.tokenize(msg['text'])
            elif msg['lang'] == 'pt':
                sents = port_sent_tokenizer.tokenize(msg['text'])
            else:
                raise ValueError("unsupported language: %s" % msg['lang'])

            tokens = []
            for s in sents:
                tokens.extend(pos_tag(word_tokenize(s)))

            # Serialize the tagged tokens as "word/tag word/tag ...".
            tagged_text = ' '.join([tuple2str(t) for t in tokens])
            msgout = {
                "database": msg['database'],
                "collection": msg['collection'],
                "spec": {
                    "_id": msg['_id']
                },
                "update": {
                    "$set": {
                        'tagged_text': tagged_text
                    }
                },
                "multi": False
            }
            self.sender.send_json(msgout)
        except Exception:
            # Report failure to the caller instead of crashing the worker.
            self.sender.send_json({'fail': 1})
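The receiving side of msgout is not part of this snippet. The payload maps
directly onto a MongoDB update, so a minimal consumer sketch with pymongo
could look like this (the connection settings and the function name are
assumptions, not the project's actual code):

import pymongo

client = pymongo.MongoClient()  # assumed default host/port

def apply_update(msgout):
    coll = client[msgout['database']][msgout['collection']]
    if msgout['multi']:
        coll.update_many(msgout['spec'], msgout['update'])
    else:
        coll.update_one(msgout['spec'], msgout['update'])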
Example 3

    def _join(self, lst, sep=' ', untag=False):
        try:
            # Works as-is when lst is a list of plain strings.
            return sep.join(lst)
        except TypeError:
            # Otherwise lst holds (word, tag) tuples.
            if untag:
                return sep.join(tup[0] for tup in lst)
            from nltk.tag import tuple2str
            return sep.join(tuple2str(tup) for tup in lst)
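A quick sketch of the three code paths (using the module-level variant of
_join shown in the later examples):

tagged = [('the', 'DT'), ('dog', 'NN')]
_join(['the', 'dog'])        # 'the dog'        -- plain strings join directly
_join(tagged)                # 'the/DT dog/NN'  -- tuples go through tuple2str
_join(tagged, untag=True)    # 'the dog'        -- tags are dropped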
Example 4
def tag_text(text, lang='en'):
    """
    Receives raw text and returns tagged text in the "word/tag" format.
    Only English ('en') and Portuguese ('pt') are supported.
    """
    if lang == 'en':
        sents = eng_sent_tokenizer.tokenize(text)
    elif lang == 'pt':
        sents = port_sent_tokenizer.tokenize(text)
    else:
        raise ValueError("unsupported language: %s" % lang)
    tokens = []
    for s in sents:
        tokens.extend(pos_tag(word_tokenize(s)))
    return ' '.join([tuple2str(t) for t in tokens])
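tag_text relies on module-level tokenizers that are not shown in the snippet.
A minimal setup sketch, assuming the standard Punkt models that ship with
NLTK:

import nltk.data
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tag import tuple2str

eng_sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
port_sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')

print(tag_text("The dog barks."))
# something like: 'The/DT dog/NN barks/VBZ ./.'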
Example 6

def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tag tuples into tag strings or just words.
    @param untag: if C{True}, omit the tag from tagged input strings.
    @type lst: C{list}
    @rtype: C{str}
    """
    try:
        return join(lst, sep=sep)
    except TypeError:
        if untag:
            return join([tup[0] for tup in lst], sep=sep)
        from nltk.tag import tuple2str
        return join([tuple2str(tup) for tup in lst], sep=sep)
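This variant predates Python 3: the bare join(lst, sep=sep) calls match the
signature of Python 2's string.join, which is presumably what the original
module imported. A Python 2-only usage sketch under that assumption:

from string import join  # Python 2 only; removed in Python 3

print _join([('the', 'DT'), ('dog', 'NN')])              # the/DT dog/NN
print _join([('the', 'DT'), ('dog', 'NN')], untag=True)  # the dog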
Example 7
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tag tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
Example 9
    def process(self, msg):
        """
        Does the POS tagging
        """
        try:
            if msg['lang'] == 'en':
                sents = eng_sent_tokenizer.tokenize(msg['text'])
            elif msg['lang'] == 'pt':
                sents = port_sent_tokenizer.tokenize(msg['text'])

            tokens = []
            for s in sents:
                tokens.extend(pos_tag(word_tokenize(s)))

            tagged_text = ' '.join([tuple2str(t) for t in tokens])
            msgout = {
                "database": msg['database'],
                "collection": msg['collection'],
                "spec": {"_id": msg['_id']},
                "update": {"$set": {'tagged_text': tagged_text}},
                "multi": False,
            }
            self.sender.send_json(msgout)
        except Exception:
            self.sender.send_json({'fail': 1})
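self.sender is not defined in the snippet; send_json matches pyzmq's socket
API, so the worker was presumably wired up along these lines (the socket type
and endpoint are assumptions):

import zmq

context = zmq.Context()
sender = context.socket(zmq.PUSH)
sender.connect("tcp://localhost:5558")  # illustrative endpoint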
Example 10
def untagging(tagged_tokens):
    # Despite its name, this joins (word, tag) tuples into a single
    # "word/tag word/tag ..." string (note the trailing space).
    untag = ""
    for t in tagged_tokens:
        untag = untag + tuple2str(t) + " "
    return untag
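A quick usage sketch:

tagged = [('time', 'NN'), ('flies', 'VBZ')]
print(untagging(tagged))                  # 'time/NN flies/VBZ ' (trailing space)

# Equivalent one-liner without the trailing space:
' '.join(tuple2str(t) for t in tagged)    # 'time/NN flies/VBZ'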
Example 11
        #print(tempArr)
        my_list.append(tempArr)
    except Exception:
        # Skip malformed rows rather than aborting the whole loop.
        continue

#print(my_list)

# THE CODE BELOW CONVERTS WORDS TO THEIR BASE FORM
# lemmatization -> maps a word to the base form (lemma) found in the dictionary
wordnet_lemmatizer = WordNetLemmatizer()

listSentencePure = list()

for element in my_list:
    # Rebuild each token list into a space-separated sentence string.
    listSentencePure.append(" ".join(element))

# POS TAGGER
for sentence in listSentencePure:
    tagged = pos_tag(word_tokenize(sentence))

    # Render each (word, tag) pair as "word/tag"; tuple2str is from nltk.tag.
    print([tuple2str(t) for t in tagged])

# TODO: EXTRACT ADJECTIVES
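The TODO asks for adjective extraction. A minimal sketch, assuming the Penn
Treebank tagset used by pos_tag (adjectives are JJ, JJR, JJS) and reusing the
lemmatizer instantiated above:

adjectives = []
for sentence in listSentencePure:
    for word, tag in pos_tag(word_tokenize(sentence)):
        if tag.startswith('JJ'):
            # pos='a' lemmatizes as an adjective (e.g. 'better' -> 'good').
            adjectives.append(wordnet_lemmatizer.lemmatize(word, pos='a'))

print(adjectives)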