def makeTraining(lemma=False, stop=False, count=False, weight=False): print "\n\tMaking the corpus..." c = Corpus("corpus/preprocessed", lemma, stop, count, weight); featuresets = [] for p in c.post_names: if not count and not weight: featuresets.append((dict([('%s' %tuple2str(f), 1) for f in c.posts[p].features]), c.posts[p].label)) # print featuresets elif not weight: featuresets.append((dict([('%s' %tuple2str(f), c.posts[p].feature_count[f[0]]) for f in c.posts[p].features]), c.posts[p].label)) else: featuresets.append((dict([('%s' %tuple2str(f), c.posts[p].feature_count[f[0]]*c.sentweight[f]) for f in c.posts[p].features]), c.posts[p].label)) avg = 0 test_set = [] train_set = [] for i in range(5): for j in range(250): if j % 5 == i: test_set.append(featuresets[j]) else: train_set.append(featuresets[j]) classifier = nltk.classify.maxent.MaxentClassifier.train(train_set, count_cutoff=0, algorithm='GIS', max_iter=25) print "\n\tMost Informative Features\n\t-------------------------------------\n" classifier.show_most_informative_features(10) avg += nltk.classify.accuracy(classifier, test_set) print "\n\tAccuracy on Test Data %s\n\t-------------------------------------\n" %"{:.3f}".format(avg/5) print "\n\n"
def process(self, msg):
    """POS-tag ``msg['text']`` and send a MongoDB-style update message.

    :param msg: dict with keys 'lang' ('en' or 'pt'), 'text',
        'database', 'collection' and '_id'.

    On success sends an update setting 'tagged_text' to the
    space-joined 'word/tag' tokens; on any failure sends ``{'fail': 1}``.
    """
    try:
        if msg['lang'] == 'en':
            sents = eng_sent_tokenizer.tokenize(msg['text'])
        elif msg['lang'] == 'pt':
            sents = port_sent_tokenizer.tokenize(msg['text'])
        else:
            # Previously an unsupported language fell through to an
            # accidental NameError on `sents`; fail explicitly instead.
            raise ValueError("unsupported language: %r" % msg['lang'])
        tokens = []
        for s in sents:
            tokens.extend(pos_tag(word_tokenize(s)))
        tagged_text = ' '.join([tuple2str(t) for t in tokens])
        msgout = {
            "database": msg['database'],
            "collection": msg['collection'],
            "spec": {"_id": msg['_id']},
            "update": {"$set": {'tagged_text': tagged_text}},
            "multi": False,
        }
        self.sender.send_json(msgout)
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; keep the best-effort failure report but
        # only for ordinary exceptions.
        self.sender.send_json({'fail': 1})
def _join(self, lst, sep=' ', untag=False): try: return sep.join(lst) except TypeError: if untag: return sep.join(tup[0] for tup in lst) from nltk.tag import tuple2str return sep.join(tuple2str(tup) for tup in lst)
def tag_text(text, lang='en'):
    """Tokenize and POS-tag raw *text*, returning the tokens in
    'word/tag' format joined by single spaces.

    :param text: raw input text.
    :param lang: 'en' or 'pt'; selects the sentence tokenizer.
    :raises ValueError: for any other language code (previously this
        surfaced as an accidental NameError on the undefined ``sents``).
    """
    if lang == 'en':
        sents = eng_sent_tokenizer.tokenize(text)
    elif lang == 'pt':
        sents = port_sent_tokenizer.tokenize(text)
    else:
        raise ValueError("unsupported language: %r" % lang)
    tokens = []
    for s in sents:
        tokens.extend(pos_tag(word_tokenize(s)))
    return ' '.join(tuple2str(t) for t in tokens)
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tagged tuples into tag strings
    or just words.

    @param untag: if C{True}, omit the tag from tagged input strings.
    @type lst: C{list}
    @rtype: C{str}
    """
    try:
        return join(lst, sep=sep)
    except TypeError:
        # The items are (word, tag) tuples rather than plain strings.
        if untag:
            words = [tup[0] for tup in lst]
            return join(words, sep=sep)
        from nltk.tag import tuple2str
        tagged = [tuple2str(tup) for tup in lst]
        return join(tagged, sep=sep)
def _join(lst, sep=' ', untag=False): """ Join a list into a string, turning tags tuples into tag strings or just words. :param untag: if ``True``, omit the tag from tagged input strings. :type lst: list :rtype: str """ try: return sep.join(lst) except TypeError: if untag: return sep.join(tup[0] for tup in lst) from nltk.tag import tuple2str return sep.join(tuple2str(tup) for tup in lst)
def process(self, msg):
    """POS-tag ``msg['text']`` and send a MongoDB-style update message.

    :param msg: dict with keys 'lang' ('en' or 'pt'), 'text',
        'database', 'collection' and '_id'.

    On success sends an update setting 'tagged_text'; on any failure
    sends ``{'fail': 1}`` instead.
    """
    try:
        if msg['lang'] == 'en':
            sents = eng_sent_tokenizer.tokenize(msg['text'])
        elif msg['lang'] == 'pt':
            sents = port_sent_tokenizer.tokenize(msg['text'])
        else:
            # Previously an unsupported language fell through to an
            # accidental NameError on `sents`; fail explicitly instead.
            raise ValueError("unsupported language: %r" % msg['lang'])
        tokens = []
        for s in sents:
            tokens.extend(pos_tag(word_tokenize(s)))
        tagged_text = ' '.join([tuple2str(t) for t in tokens])
        msgout = {
            "database": msg['database'],
            "collection": msg['collection'],
            "spec": {"_id": msg['_id']},
            "update": {"$set": {'tagged_text': tagged_text}},
            "multi": False,
        }
        self.sender.send_json(msgout)
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; report failure only for ordinary exceptions.
        self.sender.send_json({'fail': 1})
def untagging(string):
    """Render a sequence of tagged (word, tag) tuples as one string of
    'word/tag' tokens, each followed by a space.

    The trailing space of the original implementation is preserved for
    backward compatibility. The original built the result with repeated
    ``+=`` concatenation, which is quadratic; ``str.join`` is linear.

    :param string: iterable of (word, tag) tuples (the name is kept for
        interface compatibility even though it is not a str).
    :rtype: str
    """
    return "".join(tuple2str(t) + " " for t in string)
#print(tempArr) my_list.append(tempArr) except: continue #print(my_list) #KODE DIBAWAH INI ADALAH UNTUK MERUBAH KATA KE KATA DASAR #lemmatization -> mengubah sebuah kata ke kata dasar (lemma) yang ada pada kamus wordnet_lemmatizer = WordNetLemmatizer() listSentencePure = list() for element in my_list: tempStr = "" for nestedElement in element: tempStr += nestedElement + " " listSentencePure.append(tempStr[:-1:]) #POSTAGGER for sentence in listSentencePure: tagged = pos_tag(word_tokenize(sentence)) print([tag.tuple2str(t) for t in tagged]) #TODO AMBIL ADJEKTIVE # # #