Example #1
def tagSentences(path, training_list=[], testing_list=[]):
    ct = CRFTagger()
    train_list = getTrainList(training_list)
    ct.train(train_list, 'model.crf.tagger')
    sentences = getSentences(path, testing_list)
    tagged_sentences = ct.tag_sents(sentences)
    return tagged_sentences
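
For reference (not part of the original snippet): NLTK's CRFTagger.train expects the training data as a list of sentences, each a list of (token, tag) string pairs, and writes the fitted model to the given file path. A minimal sketch with made-up toy data:

from nltk.tag import CRFTagger

# toy training data: one list of (token, tag) pairs per sentence
toy_train = [
    [('I', 'PRP'), ('like', 'VBP'), ('tea', 'NN')],
    [('She', 'PRP'), ('drinks', 'VBZ'), ('coffee', 'NN')],
]

ct = CRFTagger()
ct.train(toy_train, 'toy.crf.tagger')   # trains and saves the model file
print(ct.tag(['He', 'likes', 'tea']))   # [('He', tag), ('likes', tag), ('tea', tag)]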
Example #2
def test_taggers():
    # load taggers
    unigram_tagger = load_pkl('unigram-tagger')
    tnt_tagger = load_pkl('tnt-tagger')
    perceptron_tagger = load_pkl('perceptron-tagger')
    # crf_tagger = load_pkl('crf-tagger')
    crf_tagger = CRFTagger()
    crf_tagger.set_model_file('crf-tagger.model')

    test_sents = load_pkl('test_sents')[:10]
    print(f'{len(test_sents)} sentences in testing set')

    taggers = [
        ['Unigram tagger', unigram_tagger, 0, 0],
        ['TnT tagger', tnt_tagger, 0, 0],
        ['Perceptron tagger', perceptron_tagger, 0, 0],
        ['CRF tagger', crf_tagger, 0, 0],
    ]

    for t in taggers:
        print(f'evaluating {t[0]} ... ', end='', flush=True)
        f1 = t[1].evaluate(test_sents)
        t[2] = f1
        # the evaluation result is the same as f1 score calculated by sklearn
        # f1 = cal_f1_score(t[1], test_sents)
        # t[3] = f1
        # f1 = 0
        print(f'done. f1 score: {f1}')

    best_tagger_info = max(taggers, key=lambda t: t[2])
    print('best tagger is ' + best_tagger_info[0])
    best_tagger = best_tagger_info[1]
Example #3
def train_pos_tag(dataset_dir, output_path):
    jumSample = 500000
    namaFile = dataset_dir
    with open(namaFile, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    pasangan = []
    allPasangan = []

    for line in lines[:min(jumSample, len(lines))]:
        # Remove Wiki Tags
        line = re.sub('<[^>]*>', '', line)
        if line == '':
            if len(pasangan) != 0:
                allPasangan.append(pasangan)
            pasangan = []
        else:
            kata, tag = line.split('\t')
            p = (kata, tag)
            pasangan.append(p)

    ct = CRFTagger()
    print("Training Tagger...")
    ct.train(allPasangan, output_path)
    print("Training Complete")
Example #4
class CRF:
    def __init__(self):
        self.__model = type('test', (object,), {})()
        pass

    def train(self, X_training_data):
        self.__model = CRFTagger()
        self.__model.train(X_training_data, 'crf.model')
        pass

    def test(self, X_test_data):

        total = 0
        correct = 0
        for kalimat in X_test_data:
            temp = []
            for word in kalimat:
                temp.append(word[0])

            if len(temp) != 0:
                predicted_y = self.__model.tag(temp)
                for i in range(len(predicted_y)):
                    total += 1
                    if predicted_y[i][1] == kalimat[i][1]:
                        correct += 1

        print(correct, total)
        print(correct / total)
    pass
Example #5
def question3():
    tagger = CRFTagger(feature_func=feature_func)

    tagger.train(train_sentences, 'model.crf.tagger')

    print(tagger.evaluate(test_sentences))
    return
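
Examples #5 and #21 pass a custom feature_func without showing it. In NLTK's CRFTagger, feature_func is called as feature_func(tokens, idx) and must return a list of feature strings for the token at idx; the window-based features below are only an illustrative sketch, not the original author's function:

def feature_func(tokens, idx):
    """Return a list of string features for tokens[idx]."""
    word = tokens[idx]
    feats = [
        'WORD_' + word,
        'SUFFIX3_' + word[-3:],
        'CAPITALIZED' if word[0].isupper() else 'LOWERCASE',
    ]
    # context features from the neighbouring tokens (window of size 1)
    if idx > 0:
        feats.append('PREV_' + tokens[idx - 1])
    if idx < len(tokens) - 1:
        feats.append('NEXT_' + tokens[idx + 1])
    return feats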
Example #6
def train_taggers():
    train_sents = load_pkl('train_sents')

    # instantiate taggers
    unigram_tagger = nltk.UnigramTagger(train_sents)
    tnt_tagger = tnt.TnT()
    perceptron_tagger = perceptron.PerceptronTagger(load=False)
    # limit the number of iterations as the training takes too long
    crf_tagger = CRFTagger(training_opt={'max_iterations': 100})

    print('Unigram tagger has already been trained')
    save_pkl(unigram_tagger, 'unigram-tagger')

    print('training TnT tagger ...', end='', flush=True)
    tnt_tagger.train(train_sents)
    print('Done')
    save_pkl(tnt_tagger, 'tnt-tagger')

    print('training Perceptron tagger ...', end='', flush=True)
    perceptron_tagger.train(train_sents)
    print('Done')
    save_pkl(perceptron_tagger, 'perceptron-tagger')

    print('training CRF tagger ...', end='', flush=True)
    crf_tagger.train(train_sents, 'crf-tagger.model')
    print('Done')
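
A note on the training_opt argument used above: CRFTagger forwards this dict to the underlying python-crfsuite trainer, so standard CRFsuite parameters can be set through it. A hedged sketch with a few common options (the values are arbitrary examples):

from nltk.tag import CRFTagger

crf_tagger = CRFTagger(training_opt={
    'max_iterations': 100,                  # cap the number of L-BFGS iterations
    'c1': 0.1,                              # L1 regularization weight
    'c2': 0.01,                             # L2 regularization weight
    'feature.possible_transitions': True,   # also generate transitions unseen in training
})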
Example #7
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)

    stdout_old = sys.stdout

    sys.stdout = open(os.path.join(local_dir, 'test_%d.out' % counter), 'w')

    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
    train_sents = train_reader.tagged_sents()

    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
    test_sents = test_reader.tagged_sents()

    print('Loop #' + str(counter))

    sys.stdout.flush()

    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')

    #crf_tagger = UnigramTagger(train_sents)

    # evaluate crf tagger
    crf_accuracy = None
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)

    sys.stdout = stdout_old
Example #8
class NamedEntityChunker(ChunkParserI):
  def __init__(self, train_sents, **kwargs):
    assert isinstance(train_sents, Iterable)
 
    self.feature_detector = features
    self.tagger = CRFTagger(
      feature_func=features
    )
    self.tagger.train(train_sents, 'model.crf.tagger')

    # self.tagger = ClassifierBasedTagger(
    #   train=train_sents,
    #   feature_detector=features,
    #   **kwargs)
 
  def parse(self, tagged_sent):
    chunks = self.tagger.tag(tagged_sent)
 
    # Transform the result from [((w1, t1), iob1), ...] 
    # to the preferred list of triplets format [(w1, t1, iob1), ...]
    iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
    # iob_triplets = [(w, t, 'O') for ((w, t), c) in chunks]
 
    # Transform the list of triplets to nltk.Tree format
    return conlltags2tree(iob_triplets)
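
A short usage sketch for the chunker above, assuming the module-level features function it references is defined and that train_sents uses the [((word, pos), iob), ...] per-sentence shape this tagger is trained on (the toy data and labels here are made up):

# hypothetical toy training data in the [((word, pos), iob), ...] shape
toy_train = [
  [(('Jakarta', 'NNP'), 'B-GPE'), (('is', 'VBZ'), 'O'), (('big', 'JJ'), 'O')],
  [(('He', 'PRP'), 'O'), (('visited', 'VBD'), 'O'), (('Bandung', 'NNP'), 'B-GPE')],
]
chunker = NamedEntityChunker(toy_train)

# parse() takes a POS-tagged sentence and returns an nltk.Tree with the chunks
tree = chunker.parse([('She', 'PRP'), ('lives', 'VBZ'), ('in', 'IN'), ('Jakarta', 'NNP')])
print(tree)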
Example #9
    def __init__(self,
                 train_sents=None,
                 tagger="ClassifierBasedTagger",
                 model=None,
                 model_name="../results/modelCRF_featured",
                 entities=None,
                 language="english",
                 **kwargs):

        self.all_entities = []
        self.acronyms = []
        self.language = language

        if not model:
            assert isinstance(train_sents, Iterable)

        if tagger == "ClassifierBasedTagger":
            self.feature_detector = iob_features
            self.tagger = ClassifierBasedTagger(train=train_sents,
                                                feature_detector=iob_features,
                                                **kwargs)

        elif tagger == "CRFTagger":
            self.set_entities(entities)
            if not model:

                self.tagger = CRFTagger(feature_func=self.crf_features)
                self.tagger.train(
                    train_data=train_sents,
                    model_file="../results/{}".format(model_name))
            else:
                self.tagger = CRFTagger(feature_func=self.crf_features)
                self.tagger.set_model_file(model)
        else:
            raise Exception('Unknown tagger')
Example #10
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)
    
    stdout_old = sys.stdout
    
    sys.stdout = open(os.path.join(local_dir, 'test_%d.out'%counter), 'w')  
    
    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos'%counter)
    train_sents = train_reader.tagged_sents()

    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos'%counter)
    test_sents = test_reader.tagged_sents()
    
    print('Loop #' + str(counter))
    
    sys.stdout.flush()
    
    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')

    #crf_tagger = UnigramTagger(train_sents)
    
    # evaluate crf tagger
    crf_accuracy = None
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)
    
    sys.stdout = stdout_old
Example #11
    def __init__(self, data=[]):
        self.tagger = CRFTagger()
        self.tagger.set_model_file('model.crf.tagger')

        if data.count(True) > 0:
            self.data_tagging, self.data_testing = self.for_tagging_testing(
                data)
Example #12
def tagpos(request):
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    tokenize = word_tokenize("Saya bekerja di Bandung")
    hasil = ct.tag_sents([tokenize])
    postag = nltk.pos_tag(tokenize)
    context = {
        'tokenize': tokenize,
        'postag': postag,
        'hasil': hasil,
    }
    template = loader.get_template('polls/tagged.html')
    # train_text = state_union.raw('2005-GWBush.txt')
    # sample_text = state_union.raw('2006-GWBush.txt')
    # custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    # tokenized = custom_sent_tokenizer.tokenize(sample_text)
    # tagged = []
    # for i in tokenized[:5]:
    #     words = nltk.word_tokenize(i)
    #     tagged.append(nltk.pos_tag(words))
    #
    # template = loader.get_template('polls/tagged.html')
    # context = {
    #     'tagged' : tagged
    # }
    return HttpResponse(template.render(context, request))
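
As several of these examples show, there are two tagging entry points once a model file has been loaded with set_model_file: tag takes a single tokenized sentence, while tag_sents takes a list of tokenized sentences and returns one tagged list per sentence. A small sketch (the model path is a placeholder for any trained CRF tagger file):

from nltk.tag import CRFTagger
from nltk.tokenize import word_tokenize

ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')  # placeholder model path

tokens = word_tokenize("Saya bekerja di Bandung")
print(ct.tag(tokens))          # [(token, tag), ...] for one sentence
print(ct.tag_sents([tokens]))  # [[(token, tag), ...]] - one inner list per input sentence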
Example #13
def function_pos_tagging(new_stopwords_tweets):
    ct = CRFTagger()
    ct.set_model_file('data/all_indo_man_tag_corpus_model.crf.tagger')
    new_pos_tweets = []
    for n in range(len(new_stopwords_tweets)):
        pos_tweet_word = [new_stopwords_tweets[n][0]]
        pos_tweet_words = ct.tag_sents(pos_tweet_word)
        pos_tweet = [pos_tweet_words, new_stopwords_tweets[n][1]]
        new_pos_tweets.append(pos_tweet)

    new_features_tweets = []
    for n in range(len(new_pos_tweets)):
        pos_tweets_data = new_pos_tweets[n][0][0]
        features = []
        for tokenTag in pos_tweets_data:
            token, tag = tokenTag
            access = ['NN', 'JJ', 'RB', 'VBD']
            if tag in access:
                features.append(token)
            else:
                pass

        if features:
            features_tweets = [features, new_pos_tweets[n][1]]
            new_features_tweets.append(features_tweets)
        else:
            pass
    return new_features_tweets
Example #14
    def main(self):
        # own method
        file = open("forecast_corpus.txt", "r")
        call = file.read()
        corpus = call.split()
        file.close()
        verba = []

        # stopword removal
        # sfactory = StopWordRemoverFactory()
        # stopwords = sfactory.create_stop_word_remover()
        # stop = stopwords.remove(call)
        # c = stop.split()

        # print("Membaca corpus.....")
        ct = CRFTagger()
        ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
        hasil = ct.tag_sents([corpus])

        this = Verba_finder()
        for x in range(len(hasil[0])):
            if hasil[0][x][1] == 'VB' and this.afiks_check(
                    hasil[0][x][0]) == 1 and (hasil[0][x + 1][1] == 'NN'
                                              or hasil[0][x + 1][1] == 'JJ'):
                # print(hasil[0][x])
                verba.append(" " + hasil[0][x][0] + " ")

        return verba
Example #15
class SlotTaggingModel(object):

    def __init__(self, **argparams):
        self.train_data = argparams['train_data']
        if self.train_data is not None:
            assert isinstance(self.train_data, DataSetCSVagentActPred)
        self.model_folder = argparams['model_folder']
        self.model_fname = '{}/slotTagging.model'.format(self.model_folder)

    def train(self, verbose=True):
        assert self.train_data is not None, 'train_data is required.'
        print('\ttraining ...')
        # transform data
        instance_list = self._transform_data(self.train_data)
        userUtterTag_train_fname = '{}/userUtterTag_train.txt'.format(self.model_folder)
        writeUtterTag(instance_list, userUtterTag_train_fname)
        print('\ttrain_data={}'.format(userUtterTag_train_fname))
        # train model
        self.model = CRFTagger(verbose=verbose)
        self.model.train(instance_list, self.model_fname)
        print('\tmodel_fname={}'.format(self.model_fname))
        print('\tsaving model ...')

    def _transform_data(self, data):
        ''' Convert textual utterances and their user tags into a list of instances, each a list of (word, tag) pairs.
        '''
        userUtter_txt = data.userUtter_txt
        userTag_txt = data.userTag_txt
        instance_list = list()
        for words, tags in zip(userUtter_txt, userTag_txt):
            instance = [(word.strip(), tag.strip()) for word, tag in zip(words.decode('utf-8').strip().split(), tags.decode('utf-8').strip().split())]
            instance_list.append(instance)
        return instance_list

    def predict(self, test_data):
        '''return a list of lists, [[(w1, tag1), (w2, tag2), (w3, tag3)], [...], [...]]
        '''
        assert test_data is not None, 'test_data is required.'
        assert isinstance(test_data, DataSetCSVagentActPred)
        print('\tpredicting Slot Tags ...')
        # transform data
        instance_list = self._transform_data(test_data)
        userUtterTag_test_fname = '{}/userUtterTag_test.target'.format(self.model_folder)
        writeUtterTag(instance_list, userUtterTag_test_fname)
        print('\ttag_target={}'.format(userUtterTag_test_fname))
        instance_utter_list = getUtterList(instance_list)
        # testing
        results = self.model.tag_sents(instance_utter_list)
        self.result_fname = '{}/userUtterTag_test.pred'.format(self.model_folder)
        print('\ttag_pred={}'.format(self.result_fname))
        writeUtterTag(results, self.result_fname)
        precision, recall, fscore, accuracy_frame = eval_tagPredBaseline(instance_list, results, test_data.userTag2id, test_data.userTag_vocab_size)
        print('\tprecision={:.4f}, recall={:.4f}, fscore={:.4f}, accuracy_frame={:.4f}'.format(precision, recall, fscore, accuracy_frame))
        return results

    def load_model(self, verbose=True):
        print('\tloading model ...')
        self.model = CRFTagger(verbose=verbose)
        self.model.set_model_file(self.model_fname)
Example #16
    def load_model(self, modelfile):
        with open('%s.act.model' % modelfile, 'r') as f:
            self.__speech_act_model, self.__speech_act_lb = pickle.load(f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)

        return True
Example #17
  def __init__(self, train_sents, **kwargs):
    assert isinstance(train_sents, Iterable)
 
    self.feature_detector = features
    self.tagger = CRFTagger(
      feature_func=features
    )
    self.tagger.train(train_sents, 'model.crf.tagger')
Example #18
def crftagger(hasil_stem):
    result = []
    ct = CRFTagger()
    ct.set_model_file('D://dataset/all_indo_man_tag_corpus_model.crf.tagger')
    for i in hasil_stem:
        hasil = ct.tag_sents([i])
        for j in hasil:
            result.append(j)
    return result
Example #19
def Postagging(data):
    postaggedData = []
    postagOnly = []
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    postaggedData = ct.tag_sents(data)
    for i in range(len(postaggedData)):
        for j in range(len(postaggedData[i])):    
            postagOnly.append(postaggedData[i][j][1])
    return postagOnly
Example #20
 def __init__(self, iob_predictor):
     self.iob_predictor = iob_predictor
     self.stemmer = StemmerFactory().create_stemmer()
     self.TAGGER3 = CRFTagger()
     self.TAGGER3.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
     self.label_words = self.read_label_file('label-words.txt')
     self.label_posses = self.read_label_file('label-posses.txt')
     self.label_lemmas = self.read_label_file('label-lemmas.txt')
     self.label_iob_feature = self.read_label_file('label-iob_feature.txt')
     self.label_iob_classes = self.read_label_file('label-iob_classes.txt')
Example #21
def question3():

    tagger = CRFTagger(feature_func=feature_func)
    tagger.train(train_sentences, 'model_windows_size_1.crf.tagger')

    #tagger = CRFTagger(feature_func=feature_func)
    #tagger.set_model_file('model_windows_size_1.crf.tagger')

    print(tagger.evaluate(test_sentences))
    return
Example #22
 def tag_crf(self, untagged_string: str):
     """Tag POS with CRF tagger.
     :type untagged_string: str
     :param : An untagged, untokenized string of text.
     :rtype tagged_text: str
     """
     untagged_tokens = wordpunct_tokenize(untagged_string)
     pickle_path = self.available_taggers['crf']
     tagger = CRFTagger()
     tagger.set_model_file(pickle_path)
     tagged_text = tagger.tag(untagged_tokens)
     return tagged_text
Example #23
 def tag_crf(self, untagged_string: str):
     """Tag POS with CRF tagger.
     :type untagged_string: str
     :param : An untagged, untokenized string of text.
     :rtype tagged_text: str
     """
     untagged_tokens = wordpunct_tokenize(untagged_string)
     pickle_path = self.available_taggers['crf']
     tagger = CRFTagger()
     tagger.set_model_file(pickle_path)
     tagged_text = tagger.tag(untagged_tokens)
     return tagged_text
Example #24
 def train(self, verbose=True):
     assert self.train_data is not None, 'train_data is required.'
     print('\ttraining ...')
     # transform data
     instance_list = self._transform_data(self.train_data)
     userUtterTag_train_fname = '{}/userUtterTag_train.txt'.format(self.model_folder)
     writeUtterTag(instance_list, userUtterTag_train_fname)
     print('\ttrain_data={}'.format(userUtterTag_train_fname))
     # train model
     self.model = CRFTagger(verbose=verbose)
     self.model.train(instance_list, self.model_fname)
     print('\tmodel_fname={}'.format(self.model_fname))
     print('\tsaving model ...')
Example #25
def getPosTag():
    global perLabel, jobLabel, subLabel, orgLabel, geoLabel
    raw_sent = sentInput.get()
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

    tokens = nltk.tokenize.word_tokenize(raw_sent)
    postagged = ct.tag_sents([tokens])

    data = []
    for token in postagged[0]:
        data.append(token + ('O', ))

    tagger_ner = pycrfsuite.Tagger()
    tagger_ner.open('model_ner.crfsuite')
    ner = tagger_ner.tag(sent2features(data, False))

    for i in range(len(ner)):
        data[i] = data[i][0:2] + (ner[i], )

    tagger_oh = pycrfsuite.Tagger()
    tagger_oh.open('model_oh.crfsuite')
    oh = tagger_oh.tag(sent2features(data, True))

    for i in range(len(oh)):
        data[i] += (oh[i], )

    per = []
    job = []
    sub = []
    org = []
    geo = []

    for token in data:
        if token[3] == '1':
            label = token[2][-3:]
            if label == 'PER':
                per.append(token[0])
            elif label == 'ORG':
                org.append(token[0])
            elif label == 'SUB':
                sub.append(token[0])
            elif label == 'JOB':
                job.append(token[0])
            elif label == 'GEO':
                geo.append(token[0])
    perLabel.config(text='PER: ' + (' ').join(per))
    jobLabel.config(text='JOB: ' + (' ').join(job))
    subLabel.config(text='SUB: ' + (' ').join(sub))
    orgLabel.config(text='ORG: ' + (' ').join(org))
    geoLabel.config(text='GEO: ' + (' ').join(geo))
Example #26
def crf_tag():
    news_text = brown.tagged_sents(categories='news')
    train_sents = news_text[:3230]
    test_sents = news_text[3230:4600]
    ct = CRFTagger()
    tagger = ct.train(train_sents, 'model.crf.tagger')
    test = ct.evaluate(test_sents)
    print test
    sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years".decode(
        'utf-8')
    sent_w = sent3.lower().split()
    print sent_w
    tag = ct.tag(sent_w)
    print "The Tag Is:", tag
Example #27
def pos_tagger(data, attr="paragraphs"):
    flatten = lambda l: [item for sublist in l for item in sublist]
    ct = CRFTagger()
    ct.set_model_file('dataset/all_indo_man_tag_corpus_model.crf.tagger')
    for category in data:
        category['word_tag_{}'.format(attr)] = []
        for paragraph in category[attr]:
            list_tag_kalimat = []
            for kalimat in paragraph:
                tag_kalimat = ct.tag_sents([kalimat])
                tag_kalimat = flatten(tag_kalimat)
                list_tag_kalimat.append(tag_kalimat)
            category['word_tag_{}'.format(attr)].append(list_tag_kalimat)
    return data
Example #28
    def __init__(self):
        # Load the pre-trained POS tagger data
        uni, bi, tri, word = self.load_obj("tagger")
        self.TAGGER1 = Tagger(uni, bi, tri, word)

        # Load the pre-trained POS tagger data
        uni2, bi2, tri2, word2 = self.load_obj("tagger2")
        self.TAGGER2 = Tagger(uni2, bi2, tri2, word2)

        self.TAGGER3 = CRFTagger()
        self.TAGGER3.set_model_file(
            'postagg/dataset/all_indo_man_tag_corpus_model.crf.tagger')

        # Load the grammar chunker data
        self.load_chunker()
Example #29
def ner_tag(word):
    from nltk.tag import CRFTagger
    ct = CRFTagger()

    import pickle
    infolist = pickle.load(open('infolist.pickle', 'rb'))
    infodict = {}
    posdict = {}
    nerdict = {}

    # use distinct loop variables so the 'word' argument is not shadowed
    for [w, postag, nertag] in infolist:
        if w not in posdict:
            posdict[w] = [postag]
        else:
            posdict[w].append(postag)

    for [w, postag, nertag] in infolist:
        if w not in nerdict:
            nerdict[w] = [nertag]
        else:
            nerdict[w].append(nertag)

    #print(most_common(posdict["van"]))
    ner_tag = most_common(nerdict[word])
    return ner_tag
Example #30
    def load_model(self, modelfile):
        with open('%s.act.model' % modelfile, 'r') as f:
            self.__speech_act_model, self.__speech_act_lb = pickle.load(f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)

        return True
Example #31
def pos_tagger(text):  # input: text (a string)
    # instantiate the tagger
    ct = CRFTagger()

    # load the Indonesian tagger model
    ct.set_model_file('model_postagging_crf.tagger')

    # cleaning
    text = re.sub(r'\.?\,?\(?\)?\"?', '', text)
    text = re.sub("\n", " ", text)
    text = text.split(" ")

    # this call performs the POS tagging
    tagged_text = ct.tag_sents([text])

    # result
    return tagged_text  # output: the text with POS tags attached
Example #32
def chunking(sents, chunked_file):
    '''
    Chunking
    param sents: a list, e.g. [['dog', 'is', 'dog'], ['dog', 'good']]
    '''
	
    os.chdir('/home/zqr/code/chunk2vec/')

    start_time = time.time()
    #PoS
    print '\n-->Start PoS'
    #print '->Training PoS Tagger'
    #ct = CRFTagger()
    #ct.train(chunk_traindata(pos_trainfile), 'model.crf.tagger')
    #print '->Done'
    
    #pos_testdata_gold = chunk_traindata(pos_testfile)
    
    # pos corpus
    print '->Load CRF Tagger model'
    ct = CRFTagger()
    ### this model contains the PoS tags learned from the chunking task
    ct.set_model_file('model.crf.tagger')
    print '->Posing'
    tagged_sents = ct.tag_sents(sents)
    #print 'PoS acc.:', ct.evaluate(pos_testdata_gold)
    # write the PoS-tagged sentences to a file
    print '->Write posed file'
    pos_data(tagged_sents, 'tmp_for_chunking')
    end_time = time.time()
    print '-->Done, Time:', end_time - start_time, 's'
    # to save time, use the test corpus for now
    #pos_data(pos_testdata_gold, chunk_inputfile)
        
    start_time = time.time()
    ### Chunking: requires YamCha to be installed on the system; use the CoNLL training corpus for training
    print '\n-->Start Chunking'
    os.system('yamcha-config --libexecdir')
    #os.chdir('/home/zqr/code/sent2vec/')
    os.system('cp /home/zqr/local/libexec/yamcha/Makefile .')
    # train the chunking model
    #os.system('make CORPUS=' + pos_trainfile +' MODEL=chunk_model train')
    os.system('yamcha -m chunk_model.model < tmp_for_chunking > ' + chunked_file)
    print '-->Done, Time:', time.time() - start_time, 's'
Example #33
    def train(self, modelfile):
        sa_feats = [x for x, _ in self.__speech_act_instance_list]
        sa_labels = [y for _, y in self.__speech_act_instance_list]

        self.__speech_act_lb = preprocessing.LabelBinarizer()
        sa_labels = self.__speech_act_lb.fit_transform(sa_labels)

        self.__speech_act_model = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])

        self.__speech_act_model.fit(sa_feats, sa_labels)

        with open('%s.act.model' % modelfile, 'wb') as f:
            pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.train(self.__semantic_instance_list, '%s.semantic.model' % modelfile)
Example #34
def ExtractItemsFromJudgment(text,CodeTaggerFile,TitleTaggerFile):
        text = removeHTMLTags(text)
        tokenList = tokenizeTestData(text) 
        CodesTagger = CRFTagger()
        TitleTagger = CRFTagger()
        
        CodesTagger.set_model_file(CodeTaggerFile)
        TitleTagger.set_model_file(TitleTaggerFile)
      
        taggedCodes =  CodesTagger.tag_sents(tokenList)
        taggedTitles = TitleTagger.tag_sents(tokenList)
        
        return extract_entities(taggedCodes,taggedTitles)
Example #35
def load(training, testing):
    ct = CRFTagger()
    # split the training into sentences
    t = "\n".join(training)
    sents = t.split("###/###")
    # split the sentences into tokens
    train = []
    for sent in sents:
        if sent:
            new = []
            words = sent.split("\n")
            for word in words:
                if word:
                    # split the tokens into word and tag
                    new.append(tuple(word.split("/")))
            train.append(new)
    # remove any blank sentences that have been added
    for t in train:
        if not t:
            train.remove(t)
    ct.train(train, 'model.crf.tagger')
    # test on the testing data
    s = "\n".join(testing)
    s_sents = s.split("###/###")
    test = []
    sent_tags = []
    for t in s_sents:
        if t:
            new = []
            right_tags = []
            words = t.split("\n")
            for word in words:
                if word:
                    # split the tokens into just words
                    new.append(word.split("/")[0])
                    # save the tags in a list to be used later
                    right_tags.append(word.split("/")[1])
            sent_tags.append(right_tags)
            test.append(new)
    tags = ct.tag_sents(test)
    return tags, sent_tags
Example #36
def run_crf(trainfile, testfile, model_file=None):

    maxlen = 100
    sents_train, tags_train, unique_words_train, unique_tags_train = \
        P.retrieve_sentences_tags(trainfile, maxlen=maxlen)
    sents_test, tags_test, unique_word_test, unique_tags_test = \
        P.retrieve_sentences_tags(testfile, maxlen=maxlen, allowedtags=unique_tags_train)

    train_data = []
    for n, st in enumerate(sents_train):
        s = []
        for m, _ in enumerate(st):
            s.append((unicode(sents_train[n][m], "utf-8")
                      , unicode(tags_train[n][m], "utf-8")))
        train_data.append(s)

    crf = CRFTagger()
    if model_file is None:
        crf.train(train_data, model_file='data/crf.mdl')
    else:
        crf.set_model_file(model_file)

    test_data = []
    for n, st in enumerate(sents_test):
        s = []
        for m, _ in enumerate(st):
            s.append((unicode(sents_test[n][m], "utf-8")
                      , unicode(tags_test[n][m], "utf-8")))
        test_data.append(s)

    print(crf.evaluate(test_data))
Example #37
def ExtractItemsFromJudgment(text):

        text = removeHTMLTags(text)
        
        tokenList = tokenizeTestData(text) 
        CodesTagger = CRFTagger()
        titleTagger = CRFTagger()
        
        CodesTagger.set_model_file("models/CRF-Model-OnlyCodes")
        titleTagger.set_model_file("models/CRF-Model-OnlyTitles")
      
        taggedCodes =  CodesTagger.tag_sents(tokenList)
        taggedTitles = titleTagger.tag_sents(tokenList)
       
        return extract_entities(taggedCodes,taggedTitles)
Example #38
    def train(self, modelfile):
        sa_feats = [x for x, _ in self.__speech_act_instance_list]
        sa_labels = [y for _, y in self.__speech_act_instance_list]

        self.__speech_act_lb = preprocessing.MultiLabelBinarizer()
        sa_labels = self.__speech_act_lb.fit_transform(sa_labels)

        self.__speech_act_model = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])

        self.__speech_act_model.fit(sa_feats, sa_labels)

        with open('%s.act.model' % modelfile, 'wb') as f:
            pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.train(self.__semantic_instance_list, '%s.semantic.model' % modelfile)
Example #39
from nltk.corpus import treebank
from nltk.tag import tnt, CRFTagger


# split training data from test data
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]

# train a trigram N tagger (TnT)
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
print tnt_pos_tagger.evaluate(test_data)

# train a CRF tagger
crf_tagger = CRFTagger()
crf_tagger.train(train_data,
                 '~/Documents/NLP/NLP/crf_model.txt')
print crf_tagger.evaluate(test_data)
Example #40
def create_trainingModel(train_data,ModelPath):
    if os.path.isfile(ModelPath):
        os.remove(ModelPath)
    ct = CRFTagger()
    ct.train(train_data,ModelPath)
Example #41
class SimpleSLU:
    def __init__(self):
        self.__semantic_instance_list = []
        self.__speech_act_instance_list = []

        self.__semantic_model = None
        self.__speech_act_model = None

        self.__speech_act_lb = None

    def load_model(self, modelfile):
        with open('%s.act.model' % modelfile, 'r') as f:
            self.__speech_act_model, self.__speech_act_lb = pickle.load(f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)

        return True

    def add_instance(self, utter, speech_act, semantic_tagged):
        tokenized = self.__tokenize(utter, semantic_tagged)
        if tokenized is None:
            return False

        semantic_instance = []
        for word, (bio, tag, attrs) in tokenized:
            if bio is None:
                sem_label = 'O'
            else:
                cat = None
                for attr, val in attrs:
                    if attr == 'cat':
                        cat = val
                sem_label = '%s-%s_%s' % (bio, tag, cat)
            semantic_instance.append((unicode(word.lower()), unicode(sem_label)))
        self.__semantic_instance_list.append(semantic_instance)

        sa_label_list = []
        for sa in speech_act:
            sa_labels = ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list += sa_labels

        sa_label_list = sorted(set(sa_label_list))

        word_feats = ' '.join([word.lower() for word, _ in tokenized])
        self.__speech_act_instance_list.append((word_feats, sa_label_list))

        return True

    def train(self, modelfile):
        sa_feats = [x for x, _ in self.__speech_act_instance_list]
        sa_labels = [y for _, y in self.__speech_act_instance_list]

        self.__speech_act_lb = preprocessing.MultiLabelBinarizer()
        sa_labels = self.__speech_act_lb.fit_transform(sa_labels)

        self.__speech_act_model = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])

        self.__speech_act_model.fit(sa_feats, sa_labels)

        with open('%s.act.model' % modelfile, 'wb') as f:
            pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.train(self.__semantic_instance_list, '%s.semantic.model' % modelfile)

    def pred(self, utter):
        tokenized = self.__tokenize(utter)
        word_feats = ' '.join([word.lower() for word, _ in tokenized])

        pred_act = self.__speech_act_lb.inverse_transform(self.__speech_act_model.predict([word_feats]))
        pred_semantic = self.__semantic_model.tag([word.lower() for word, _ in tokenized])

        return (pred_act, pred_semantic)

    def __tokenize(self, utter, semantic_tagged=None):
        result = None
        if semantic_tagged is None:
            result = [(word, None) for word in nltk.word_tokenize(utter)]
        else:
            parser_raw = SemanticTagParser(False)
            parser_tagged = SemanticTagParser(False)

            segmented = ' '.join(nltk.word_tokenize(utter))
            tagged = ' '.join(semantic_tagged)

            parser_raw.feed(segmented)
            parser_tagged.feed(tagged)

            raw_chr_seq = parser_raw.get_chr_seq()
            raw_space_seq = parser_raw.get_chr_space_seq()

            tagged_chr_seq = parser_tagged.get_chr_seq()
            tagged_space_seq = parser_tagged.get_chr_space_seq()

            if raw_chr_seq == tagged_chr_seq:
                merged_space_seq = [
                    x or y for x, y in zip(raw_space_seq, tagged_space_seq)]

                word_seq = parser_tagged.tokenize(merged_space_seq)
                tag_seq = parser_tagged.get_word_tag_seq()

                result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]

        return result
Example #42
        for word in wordsPerLine :
            requiredFormat.append(word)
    #print "\nrequiredFormat = ",requiredFormat
    return requiredFormat


print "\nReading training corpus...."
ListOfSentences_Training = corpusRead(Training_Data)
print "Reading test corpus...."  
ListOfSentences_Test = corpusRead(Test_Data)




#CRF Training
ct = CRFTagger()
print "CRF Training starts..."
ct.train(ListOfSentences_Training,'model.crf.tagger')
print "CRF Training is done."

print "Testing starts"
print "Accuracy of CRF is = ",ct.evaluate(ListOfSentences_Test) * 100
#Tagging by CRF Tagger
ch = 'y'
while (ch != 'n'):
    text = raw_input("Enter the text to be tagged : \n")
    text = converter(text)
    print ct.tag_sents(text)
    print "\nDo you want to continue ?"
    ch = raw_input()
 
#with codecs.open("/Users/Preethi/nlp_project/EMNLP/mandarin_english/training/mandarin-english-training.txt","r","utf-8") as f:
    line=f.readline()
    line_list=[]
    while line:
        #print(line)
        words=line.replace("\r","").replace("\n","").split("\t")
        #print(words)
        if(len(words)<2):
            train_data.append(line_list)
            line_list=[]
        else:
            tup1=(words[0],words[1])
            line_list.append(tup1)
        line=f.readline()
    f.close()
ct = CRFTagger()

ct.train(train_data,'model.crf.tagger')


test_actual=[]
test_sentences=[]
#with codecs.open("nepali-english-demo-20%training-data.txt","r","utf-8") as f:
with codecs.open("/Users/Preethi/nlp_project/EMNLP/spanish_english/training/spanish-english-training-20%.txt","r","utf-8") as f:
#with codecs.open("/Users/Preethi/nlp_project/EMNLP/mandarin_english/training/mandarin-english-testing-answers.txt","r","utf-8") as f:
    line=f.readline()
    test=[]
    sentence=[]
    while line:
        words=line.replace("\r","").replace("\n","").split("\t")
        #print(words)
Example #44
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []
    
    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part
        
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make crf tagger
        crf_tagger = CRFTagger()
        crf_tagger.train(train_sents, 'model.crf.tagger')
        
        # evaluate crf tagger
        crf_accuracy = None
        crf_accuracy = crf_tagger.evaluate(test_sents)
        crf_accuracies.append(crf_accuracy)
        print('crf:', crf_accuracy)

        #if counter> 0: break
        
    final_accuracies_list = []
    mean_accuracy_crf = mean(crf_accuracies)
    standard_deviation_crf = stdev(crf_accuracies)
    uni = {'crf': {'mean': mean_accuracy_crf, 'sd': standard_deviation_crf}}
    final_accuracies_list.append(uni)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    
    return final_dict