Example 1
    def __init__(self,
                 train_sents=None,
                 tagger="ClassifierBasedTagger",
                 model=None,
                 model_name="../results/modelCRF_featured",
                 entities=None,
                 language="english",
                 **kwargs):

        self.all_entities = []
        self.acronyms = []
        self.language = language

        if not model:
            assert isinstance(train_sents, Iterable)

        if tagger == "ClassifierBasedTagger":
            self.feature_detector = iob_features
            self.tagger = ClassifierBasedTagger(train=train_sents,
                                                feature_detector=iob_features,
                                                **kwargs)

        elif tagger == "CRFTagger":
            self.set_entities(entities)
            if not model:

                self.tagger = CRFTagger(feature_func=self.crf_features)
                self.tagger.train(
                    train_data=train_sents,
                    model_file="../results/{}".format(model_name))
            else:
                self.tagger = CRFTagger(feature_func=self.crf_features)
                self.tagger.set_model_file(model)
        else:
            raise ValueError('Unknown tagger: {}'.format(tagger))
Example 2
def ner_tag(word):
    from nltk.tag import CRFTagger
    ct = CRFTagger()

    import pickle
    infolist = pickle.load(open('infolist.pickle', 'rb'))
    posdict = {}
    nerdict = {}

    # collect every POS and NER tag observed for each word;
    # the loop variable `w` avoids shadowing the `word` argument
    for w, postag, nertag in infolist:
        posdict.setdefault(w, []).append(postag)
        nerdict.setdefault(w, []).append(nertag)

    #print(most_common(posdict["van"]))
    ner_tag = most_common(nerdict[word])
    return ner_tag
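Both dictionaries feed a most_common helper that the snippet does not define. A minimal sketch, assuming it simply returns the most frequent element of a list:

from collections import Counter

def most_common(items):
    # Counter.most_common(1) yields [(element, count)] for the top element
    return Counter(items).most_common(1)[0][0]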
Example 3
def tagSentences(path, training_list=[], testing_list=[]):
    ct = CRFTagger()
    train_list = getTrainList(training_list)
    ct.train(train_list, 'model.crf.tagger')
    sentences = getSentences(path, testing_list)
    tagged_sentences = ct.tag_sents(sentences)
    return tagged_sentences
Example 4
def question3():
    tagger = CRFTagger(feature_func=feature_func)

    tagger.train(train_sentences, 'model.crf.tagger')

    print(tagger.evaluate(test_sentences))
    return
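Several of these snippets (Examples 4, 13, 17, and 29) pass a custom feature function to CRFTagger without showing it. NLTK's CRFTagger expects a callable that takes the token list and the current index and returns a list of feature strings; a minimal hypothetical sketch:

def feature_func(tokens, idx):
    # return a list of feature strings for the token at position idx;
    # the features below are illustrative, not the original authors' set
    word = tokens[idx]
    features = ['WORD_' + word.lower(), 'SUFFIX_' + word[-3:]]
    if word[0].isupper():
        features.append('CAPITALIZED')
    if idx > 0:
        features.append('PREV_' + tokens[idx - 1].lower())
    return features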
Example 5
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)

    stdout_old = sys.stdout

    sys.stdout = open(os.path.join(local_dir, 'test_%d.out' % counter), 'w')

    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
    train_sents = train_reader.tagged_sents()

    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
    test_sents = test_reader.tagged_sents()

    print('Loop #' + str(counter))

    sys.stdout.flush()

    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')

    #crf_tagger = UnigramTagger(train_sents)

    # evaluate crf tagger
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)

    sys.stdout = stdout_old
Example 6
    def __init__(self, data=None):
        data = data if data is not None else []
        self.tagger = CRFTagger()
        self.tagger.set_model_file('model.crf.tagger')

        if data.count(True) > 0:
            self.data_tagging, self.data_testing = self.for_tagging_testing(
                data)
Example 7
def function_pos_tagging(new_stopwords_tweets):
    ct = CRFTagger()
    ct.set_model_file('data/all_indo_man_tag_corpus_model.crf.tagger')
    new_pos_tweets = []
    for tweet in new_stopwords_tweets:
        # tag_sents expects a list of token lists
        pos_tweet_words = ct.tag_sents([tweet[0]])
        new_pos_tweets.append([pos_tweet_words, tweet[1]])

    new_features_tweets = []
    access = ['NN', 'JJ', 'RB', 'VBD']
    for pos_tweet in new_pos_tweets:
        pos_tweets_data = pos_tweet[0][0]
        # keep only tokens whose tag is in the allowed set
        features = [token for token, tag in pos_tweets_data if tag in access]
        if features:
            new_features_tweets.append([features, pos_tweet[1]])
    return new_features_tweets
Example 8
    def main(self):
        # our OWN method
        file = open("forecast_corpus.txt", "r")
        call = file.read()
        corpus = call.split()
        file.close()
        verba = []

        # stopword removal
        # sfactory = StopWordRemoverFactory()
        # stopwords = sfactory.create_stop_word_remover()
        # stop = stopwords.remove(call)
        # c = stop.split()

        # print("Membaca corpus.....")
        ct = CRFTagger()
        ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
        hasil = ct.tag_sents([corpus])

        this = Verba_finder()
        # iterate to the second-to-last token: the check looks ahead at x + 1
        for x in range(len(hasil[0]) - 1):
            if hasil[0][x][1] == 'VB' and this.afiks_check(
                    hasil[0][x][0]) == 1 and (hasil[0][x + 1][1] == 'NN'
                                              or hasil[0][x + 1][1] == 'JJ'):
                # print(hasil[0][x])
                verba.append(" " + hasil[0][x][0] + " ")

        return verba
Example 9
def train_taggers():
    train_sents = load_pkl('train_sents')

    # instantiate taggers
    unigram_tagger = nltk.UnigramTagger(train_sents)
    tnt_tagger = tnt.TnT()
    perceptron_tagger = perceptron.PerceptronTagger(load=False)
    # limit the number of iterations as the training takes too long
    crf_tagger = CRFTagger(training_opt={'max_iterations': 100})

    print('Unigram tagger has already been trained')
    save_pkl(unigram_tagger, 'unigram-tagger')

    print('training TnT tagger ...', end='', flush=True)
    tnt_tagger.train(train_sents)
    print('Done')
    save_pkl(tnt_tagger, 'tnt-tagger')

    print('training Perceptron tagger ...', end='', flush=True)
    perceptron_tagger.train(train_sents)
    print('Done')
    save_pkl(perceptron_tagger, 'perceptron-tagger')

    print('training CRF tagger ...', end='', flush=True)
    crf_tagger.train(train_sents, 'crf-tagger.model')
    print('Done')
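Examples 9 and 12 rely on save_pkl and load_pkl helpers that are not shown. Hypothetical one-liners over the standard pickle module (the '.pkl' suffix is an assumption):

import pickle

def save_pkl(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)

def load_pkl(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)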
Example 10
def tagpos(request):
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    tokenize = word_tokenize("Saya bekerja di Bandung")
    hasil = ct.tag_sents([tokenize])
    postag = nltk.pos_tag(tokenize)
    context = {
        'tokenize': tokenize,
        'postag': postag,
        'hasil': hasil,
    }
    template = loader.get_template('polls/tagged.html')
    # train_text = state_union.raw('2005-GWBush.txt')
    # sample_text = state_union.raw('2006-GWBush.txt')
    # custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    # tokenized = custom_sent_tokenizer.tokenize(sample_text)
    # tagged = []
    # for i in tokenized[:5]:
    #     words = nltk.word_tokenize(i)
    #     tagged.append(nltk.pos_tag(words))
    #
    # template = loader.get_template('polls/tagged.html')
    # context = {
    #     'tagged' : tagged
    # }
    return HttpResponse(template.render(context, request))
Example 11
def train_pos_tag(dataset_dir, output_path):
    jumSample = 500000
    namaFile = dataset_dir
    with open(namaFile, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    pasangan = []
    allPasangan = []

    for line in lines[:min(jumSample, len(lines))]:
        # Remove Wiki Tags
        line = re.sub('<[^>]*>', '', line)
        if line == '':
            if len(pasangan) != 0:
                allPasangan.append(pasangan)
            pasangan = []
        else:
            kata, tag = line.split('\t')
            pasangan.append((kata, tag))

    # keep the last sentence if the file does not end with a blank line
    if pasangan:
        allPasangan.append(pasangan)

    ct = CRFTagger()
    print("Training Tagger...")
    ct.train(allPasangan, output_path)
    print("Training Complete")
Example 12
def test_taggers():
    # load taggers
    unigram_tagger = load_pkl('unigram-tagger')
    tnt_tagger = load_pkl('tnt-tagger')
    perceptron_tagger = load_pkl('perceptron-tagger')
    # crf_tagger = load_pkl('crf-tagger')
    crf_tagger = CRFTagger()
    crf_tagger.set_model_file('crf-tagger.model')

    test_sents = load_pkl('test_sents')[:10]
    print(f'{len(test_sents)} sentences in testing set')

    taggers = [
        ['Unigram tagger', unigram_tagger, 0, 0],
        ['TnT tagger', tnt_tagger, 0, 0],
        ['Perceptron tagger', perceptron_tagger, 0, 0],
        ['CRF tagger', crf_tagger, 0, 0],
    ]

    for t in taggers:
        print(f'evaluating {t[0]} ... ', end='', flush=True)
        f1 = t[1].evaluate(test_sents)
        t[2] = f1
        # the evaluation result is the same as f1 score calculated by sklearn
        # f1 = cal_f1_score(t[1], test_sents)
        # t[3] = f1
        # f1 = 0
        print(f'done. f1 score: {f1}')

    best_tagger_info = max(taggers, key=lambda t: t[2])
    print('best tagger is ' + best_tagger_info[0])
    best_tagger = best_tagger_info[1]
Example 13
  def __init__(self, train_sents, **kwargs):
    assert isinstance(train_sents, Iterable)
 
    self.feature_detector = features
    self.tagger = CRFTagger(
      feature_func=features
    )
    self.tagger.train(train_sents, 'model.crf.tagger')
Example 14
    def load_model(self, modelfile):
        with open('%s.act.model' % modelfile, 'rb') as f:
            self.__speech_act_model, self.__speech_act_lb = pickle.load(f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)

        return True
Example 15
def crftagger(hasil_stem):
    result = []
    ct = CRFTagger()
    ct.set_model_file('D://dataset/all_indo_man_tag_corpus_model.crf.tagger')
    for i in hasil_stem:
        hasil = ct.tag_sents([i])
        for j in hasil:
            result.append(j)
    return result
Example 16
 def __init__(self, iob_predictor):
     self.iob_predictor = iob_predictor
     self.stemmer = StemmerFactory().create_stemmer()
     self.TAGGER3 = CRFTagger()
     self.TAGGER3.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
     self.label_words = self.read_label_file('label-words.txt')
     self.label_posses = self.read_label_file('label-posses.txt')
     self.label_lemmas = self.read_label_file('label-lemmas.txt')
     self.label_iob_feature = self.read_label_file('label-iob_feature.txt')
     self.label_iob_classes = self.read_label_file('label-iob_classes.txt')
Example 17
def question3():

    tagger = CRFTagger(feature_func=feature_func)
    tagger.train(train_sentences, 'model_windows_size_1.crf.tagger')

    #tagger = CRFTagger(feature_func=feature_func)
    #tagger.set_model_file('model_windows_size_1.crf.tagger')

    print(tagger.evaluate(test_sentences))
    return
Example 18
def Postagging(data):
    postaggedData = []
    postagOnly = []
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    postaggedData = ct.tag_sents(data)
    for i in range(len(postaggedData)):
        for j in range(len(postaggedData[i])):    
            postagOnly.append(postaggedData[i][j][1])
    return postagOnly
Example 19
 def tag_crf(self, untagged_string: str):
     """Tag POS with CRF tagger.
     :type untagged_string: str
     :param untagged_string: An untagged, untokenized string of text.
     :rtype tagged_text: str
     """
     untagged_tokens = wordpunct_tokenize(untagged_string)
     pickle_path = self.available_taggers['crf']
     tagger = CRFTagger()
     tagger.set_model_file(pickle_path)
     tagged_text = tagger.tag(untagged_tokens)
     return tagged_text
Example 20
 def train(self, verbose=True):
     assert self.train_data is not None, 'train_data is required.'
     print('\ttraining ...')
     # transform data
     instance_list = self._transform_data(self.train_data)
     userUtterTag_train_fname = '{}/userUtterTag_train.txt'.format(self.model_folder)
     writeUtterTag(instance_list, userUtterTag_train_fname)
     print('\ttrain_data={}'.format(userUtterTag_train_fname))
     # train model
     self.model = CRFTagger(verbose=verbose)
     self.model.train(instance_list, self.model_fname)
     print('\tmodel_fname={}'.format(self.model_fname))
     print('\tsaving model ...')
Example 21
def getPosTag():
    global perLabel, jobLabel, subLabel, orgLabel, geoLabel
    raw_sent = sentInput.get()
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

    tokens = nltk.tokenize.word_tokenize(raw_sent)
    postagged = ct.tag_sents([tokens])

    data = []
    for token in postagged[0]:
        data.append(token + ('O', ))

    tagger_ner = pycrfsuite.Tagger()
    tagger_ner.open('model_ner.crfsuite')
    ner = tagger_ner.tag(sent2features(data, False))

    for i in range(len(ner)):
        data[i] = data[i][0:2] + (ner[i], )

    tagger_oh = pycrfsuite.Tagger()
    tagger_oh.open('model_oh.crfsuite')
    oh = tagger_oh.tag(sent2features(data, True))

    for i in range(len(oh)):
        data[i] += (oh[i], )

    per = []
    job = []
    sub = []
    org = []
    geo = []

    for token in data:
        if token[3] == '1':
            label = token[2][-3:]
            if label == 'PER':
                per.append(token[0])
            elif label == 'ORG':
                org.append(token[0])
            elif label == 'SUB':
                sub.append(token[0])
            elif label == 'JOB':
                job.append(token[0])
            elif label == 'GEO':
                geo.append(token[0])
    perLabel.config(text='PER: ' + ' '.join(per))
    jobLabel.config(text='JOB: ' + ' '.join(job))
    subLabel.config(text='SUB: ' + ' '.join(sub))
    orgLabel.config(text='ORG: ' + ' '.join(org))
    geoLabel.config(text='GEO: ' + ' '.join(geo))
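Example 21 also depends on a sent2features helper that is not included. A hypothetical sketch; the real feature set must match whatever model_ner.crfsuite and model_oh.crfsuite were trained on:

def sent2features(sent, use_ner):
    # one feature dict per token; pycrfsuite.Tagger.tag accepts
    # a sequence of dicts (or lists of feature strings)
    feats = []
    for token in sent:
        f = {'word': token[0].lower(), 'postag': token[1]}
        if use_ner:
            # the second pass also sees the NER tag appended to each tuple
            f['nertag'] = token[2]
        feats.append(f)
    return feats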
Example 22
def crf_tag():
    news_text = brown.tagged_sents(categories='news')
    train_sents = news_text[:3230]
    test_sents = news_text[3230:4600]
    ct = CRFTagger()
    # train() saves the model to file; it does not return a tagger
    ct.train(train_sents, 'model.crf.tagger')
    test = ct.evaluate(test_sents)
    print(test)
    sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years"
    sent_w = sent3.lower().split()
    print(sent_w)
    tag = ct.tag(sent_w)
    print("The Tag Is:", tag)
Example 23
def pos_tagger(data, attr="paragraphs"):
    flatten = lambda l: [item for sublist in l for item in sublist]
    ct = CRFTagger()
    ct.set_model_file('dataset/all_indo_man_tag_corpus_model.crf.tagger')
    for category in data:
        category['word_tag_{}'.format(attr)] = []
        for paragraph in category[attr]:
            list_tag_kalimat = []
            for kalimat in paragraph:
                tag_kalimat = ct.tag_sents([kalimat])
                tag_kalimat = flatten(tag_kalimat)
                list_tag_kalimat.append(tag_kalimat)
            category['word_tag_{}'.format(attr)].append(list_tag_kalimat)
    return data
Example 24
    def __init__(self):
        # Load the pre-trained POS-tagger data
        uni, bi, tri, word = self.load_obj("tagger")
        self.TAGGER1 = Tagger(uni, bi, tri, word)

        # Load the pre-trained POS-tagger data
        uni2, bi2, tri2, word2 = self.load_obj("tagger2")
        self.TAGGER2 = Tagger(uni2, bi2, tri2, word2)

        self.TAGGER3 = CRFTagger()
        self.TAGGER3.set_model_file(
            'postagg/dataset/all_indo_man_tag_corpus_model.crf.tagger')

        # Load the grammar chunker data
        self.load_chunker()
Example 25
def pos_tagger(text):  # input: a text string
    # instantiate the tagger
    ct = CRFTagger()

    # load the Indonesian POS-tagging model
    ct.set_model_file('model_postagging_crf.tagger')

    # cleaning
    text = re.sub(r'[.,()"]', '', text)
    text = re.sub(r'\n', ' ', text)
    text = text.split(' ')

    # run the POS tagging
    tagged_text = ct.tag_sents([text])

    # result
    return tagged_text  # output: the POS-tagged text
Example 26
def make_pos_model(model_type):
    """Load selected algorithm, save model to models repo."""
    now = time.time()

    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file = 'tnt.pickle'
    elif model_type == 'crf':
        tagger = CRFTagger()
        file = 'crf.pickle'
        _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
        path = os.path.join(_dir, file)
        tagger.train(train_sents, path)
        print('Completed training {0} model in {1} seconds to {2}.'.format(
            model_type,
            time.time() - now, path))
        return
    else:
        print('Invalid model_type.')

    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)

    print('Completed training {0} model in {1} seconds to {2}.'.format(
        model_type,
        time.time() - now, path))
Example 27
def getData(filename):
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

    result = []
    annotated = []
    with open(filename + '.csv', 'r') as f:
        reader = csv.reader(f)
        annotated = list(reader)

    sent = []
    sent_gold = []
    sent_oh = []
    curr_sent = ''
    for token in annotated:
        if curr_sent != str(token[1]) + ' ' + str(token[2]):
            hasil = ct.tag_sents([sent])
            mytuple = []
            for idx in range(len(sent)):
                try:
                    mytuple.append(hasil[0][idx] +
                                   (sent_gold[idx], sent_oh[idx]))
                except IndexError:
                    pass
            result.append(mytuple)
            sent = []
            sent_gold = []
            sent_oh = []
            curr_sent = str(token[1]) + ' ' + str(token[2])
        sent.append(token[4])
        sent_gold.append(token[5])
        sent_oh.append(token[6])
    hasil = ct.tag_sents([sent])
    mytuple = []
    for idx in range(len(sent)):
        try:
            mytuple.append(hasil[0][idx] + (sent_gold[idx], sent_oh[idx]))
        except IndexError:
            pass
    result.append(mytuple)
    result = result[1:]
    print('Total sentence: ' + str(len(result)))
    random.shuffle(result)
    return result
Example 28
    def train(self, modelfile):
        sa_feats = [x for x, _ in self.__speech_act_instance_list]
        sa_labels = [y for _, y in self.__speech_act_instance_list]

        self.__speech_act_lb = preprocessing.LabelBinarizer()
        sa_labels = self.__speech_act_lb.fit_transform(sa_labels)

        self.__speech_act_model = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])

        self.__speech_act_model.fit(sa_feats, sa_labels)

        with open('%s.act.model' % modelfile, 'wb') as f:
            pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.train(self.__semantic_instance_list, '%s.semantic.model' % modelfile)
Example 29
def main(no_stopwords, use_manual_train_set):

	print "MAINTAIN COMMON WORDS: " + str(not no_stopwords)
	print "USING HAND LABELED TRAIN DATA: " + str(use_manual_train_set)

	full_set = get_domain_set(no_stopwords)
	if not no_stopwords:
		full_set.extend(get_other_set())

	train_set, test_set_auto = divide_sets(full_set, 0.75)
	set_manual = get_manual_set(no_stopwords)

	train_set_manual = []
	test_set_manual = []
	if use_manual_train_set:
		train_set_manual, test_set_manual = divide_sets(set_manual, 0.28)
		train_set.extend(train_set_manual)
	else:
		test_set_manual = set_manual

	tagger = CRFTagger(feature_func=feature_extraction)
	try:
		tagger.train(train_set, 'laptop.crf.tagger')
	except ValueError:
		fi = open('DEBUG', 'w', encoding='utf-8')
		for li in DEBUG:
			fi.write(li + '\n')
		fi.close()

	print "AUTOMATIC LABELED TEST"
	tagged_sents_auto = tagger.tag_sents(map_test_set(test_set_auto, word=True))
	predicted_auto = create_vector_of_predicted_labels(tagged_sents_auto)
	golden_auto = create_vector_of_predicted_labels(test_set_auto)

	print(calculate_micro_accuracy(predicted_auto, golden_auto, no_stopwords))

	print "MANUAL LABELED TEST"
	tagged_sents_manual = tagger.tag_sents(map_test_set(test_set_manual, word=True))
	predicted_manual = create_vector_of_predicted_labels(tagged_sents_manual)
	golden_manual = create_vector_of_predicted_labels(test_set_manual)
	
	print(calculate_micro_accuracy(predicted_manual, golden_manual, no_stopwords))
	print("")
Example 30
def train_tagger(language, model_type, feature, train_sents):
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
    elif model_type == 'crf':
        tagger = CRFTagger()
        tagger.train(train_sents,
                     'taggers/{0}/{1}/crf.pickle'.format(language, feature))
    elif model_type == 'perceptron':
        tagger = PerceptronTagger(load=False)
        tagger.train(train_sents)
    else:
        raise ValueError('Unknown model_type: {}'.format(model_type))

    return tagger