Example #1
def main():
    parser = argparse.ArgumentParser(description='Part-of-Speech Tagging.')
    parser.add_argument(
        '--prefix',
        '-p',
        type=str,
        default='',
        help='specify prefix of files which will be used to store model')
    parser.add_argument('--times',
                        '-t',
                        type=int,
                        default=1,
                        help='specify iteration times')
    parser.add_argument(
        '--all',
        '-a',
        action='store_true',
        help='without this switch, the model will be trained on randomly sampled data'
    )
    parser.add_argument('--file',
                        '-f',
                        type=str,
                        default='',
                        help='specify test data file')
    parser.add_argument('--save',
                        '-s',
                        action='store_true',
                        help='enable this to save model file')
    args = parser.parse_args()
    tagger = Tagger('data/wsj00-18.pos', args.times, not args.all, args.save,
                    args.prefix)
    test_data = Processor(args.file)
    tagger.benchmark(test_data)
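
A note on exercising the parser above: argparse accepts an explicit argument list, so the flag handling can be checked without a shell. A minimal sketch (the flag definitions are copied from this example; the sample values are illustrative):

import argparse

parser = argparse.ArgumentParser(description='Part-of-Speech Tagging.')
parser.add_argument('--times', '-t', type=int, default=1)
parser.add_argument('--all', '-a', action='store_true')
args = parser.parse_args(['--times', '5', '-a'])
print(args.times, args.all)  # prints: 5 True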
Example #2
def main_tag(featureSet, options):
    labelCounter, featCounter = BookKeeper(), BookKeeper()
    labelCounter.readFromFile('{0}.labelNumbers'.format(options.modelName))
    featCounter.readFromFile('{0}.featureNumbers'.format(options.modelName))
    optionsDict = vars(options)
    optionsDict['labelCounter'] = labelCounter
    optionsDict['featCounter'] = featCounter
    optionsDict['modelFile'] = '{0}.model'.format(options.modelName)
    tagger = Tagger(featureSet, optionsDict)
    if options.inFeatFile:
        tagger_func = lambda: tagger.tag_features(options.inFeatFile)
        writer_func = lambda s, c: writeSentence(s, comment=c)
    elif options.input_dir:
        assert isdir(options.input_dir), "--input-dir must be a directory"
        out_dir = "{}_out".format(options.input_dir)
        os.mkdir(out_dir)
        tagger_func = lambda: tagger.tag_dir(options.input_dir)
        writer_func = lambda s, c: writeSentence(
            s, out=open(join(out_dir, '{}.tagged'.format(c)), 'a'))
    else:
        tagger_func = lambda: tagger.tag_corp(sys.stdin)
        writer_func = lambda s, c: writeSentence(s, comment=c)

    for sen, other in tagger_func():
        writer_func(sen, other)
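
The function above picks a producer and a writer once, then drives both through a single generic loop. A minimal standalone sketch of that dispatch pattern (all names are illustrative, not from the original code):

def pick_pipeline(mode):
    # choose the pair up front so the main loop stays generic
    if mode == 'upper':
        produce = lambda: ((s.upper(), i) for i, s in enumerate(['a', 'b']))
    else:
        produce = lambda: ((s, i) for i, s in enumerate(['a', 'b']))
    write = lambda item, comment: print(item, '#', comment)
    return produce, write

produce, write = pick_pipeline('upper')
for item, comment in produce():
    write(item, comment)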
Example #4
def evaluation(hp, fn_model, data):
    tagger = Tagger(vocabs=hp['VOCAB'], params=fn_model, hp=hp['HYPERPARAMS'])

    gold = open(hp['TMP_GOLD'], 'w')
    pred = open(hp['TMP_PRED'], 'w')
    indice = [i for i in range(len(data.ws_data))]
    for i in indice:
        words = data.words[i]
        pids = data.pos_data[i][1]
        postags = [tagger.id2pos[pid] for pid in pids]

        for w, p in zip(words, postags):
            gold.write(w + '\t' + p + '\n')
        gold.write('EOS\n')

        output = tagger.tagging(''.join(words))
        sys_words = output.words
        sys_postags = output.postags

        for w, p in zip(sys_words, sys_postags):
            pred.write(w + '\t' + p + '\n')
        pred.write('EOS\n')

    gold.close()
    pred.close()

    ws_f, pos_f = mecab_eval(hp['TMP_PRED'], hp['TMP_GOLD'])
    return ws_f, pos_f
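
Because mecab_eval re-reads TMP_PRED and TMP_GOLD from disk, both files must be flushed and closed before it runs; a context-manager variant sidesteps the issue entirely. A minimal sketch (the file names are illustrative):

with open('tmp_gold.txt', 'w') as gold, open('tmp_pred.txt', 'w') as pred:
    gold.write('word\tNOUN\nEOS\n')
    pred.write('word\tNOUN\nEOS\n')
# both files are guaranteed to be flushed and closed here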
Example #5
def pos():
    post_data = request.json["text"]

    text_tagger = Tagger()
    response = text_tagger.run(post_data)

    return json.dumps(response)
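
For context, a view like this is typically registered on a Flask application object. A minimal runnable sketch (the route path and the stub Tagger are assumptions, not part of the original code):

import json
from flask import Flask, request

app = Flask(__name__)

class Tagger:
    def run(self, text):
        return {'text': text, 'tags': []}  # stub standing in for the real tagger

@app.route('/pos', methods=['POST'])
def pos():
    post_data = request.json["text"]
    return json.dumps(Tagger().run(post_data))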
Example #6
 def __init__(self, n=3, glm=False):
     Reader.__init__(self, n=n)
     Tagger.__init__(self, glm=glm)
     self.tags = set()
     self.tokens = set()
     self.emission_params = {}
     self.transition_params = {}
Example #7
def _evaluation(hp, fn_model, data):
    tagger = Tagger(vocabs=hp['VOCAB'], params=fn_model, hp=hp['HYPERPARAMS'])

    def data_for_eval(words, postags):
        sent = []
        for w, p in zip(words, postags):
            p = w + "\t" + p
            if mecab_system_eval.PY_3:
                w = w.encode("UTF-8")
                p = p.encode("UTF-8")
            sent.append([w, p])
        return sent

    sys_data = []
    ans_data = []
    indice = [i for i in range(len(data.ws_data))]
    for i in indice:
        words = data.words[i]
        pids = data.pos_data[i][1]
        postags = [tagger.id2pos[pid] for pid in pids]
        ans_data.append(data_for_eval(words, postags))

        output = tagger.tagging(''.join(words))
        sys_words = output.words
        sys_postags = output.postags
        sys_data.append(data_for_eval(sys_words, sys_postags))

    r = mecab_system_eval.mecab_eval(sys_data, ans_data)
    _, _, ws_f, _, _, pos_f = mecab_system_eval.calculate_fvalues(r)
    return ws_f, pos_f
Example #8
def diversity_sampling(feature_now, model_ver, budget):
    csvfile = 'records_us/'+model_ver+'.csv'
    model_selected = []

    for i in feature_now:
        with tf.name_scope(model_ver + '/feature_{0}'.format(i)):
            model = Tagger(model_file=model_ver + '/feature_{0}'.format(i),
                           n_input=FEATURE_SHAPE[i][0],
                           n_steps=FEATURE_SHAPE[i][1],
                           feature_number=i)
        model.train([], [], feature_number=i)
        model_selected.append(model)
        
    train_data, test_data = data_generation(model_selected, feature_now)
    train_x_all = train_data[0]
    train_y_all = train_data[1]
    test_x_all = test_data[0]
    test_y_all = test_data[1]
    episode = 1
    print(">>>>>> Playing game ..")
    while episode <= MAX_EPISODE:
        sample_N = min(budget*4,len(train_y_all))
        
        N = len(train_y_all)
        budget = min(budget,N)
        
        s = diversitySampling(train_x_all[:, :sample_N], pool=[], budget=budget)
        s.updateCplus()
        queried_indexs = s.newind
        for i in range(len(model_selected)):
            model_selected[i].train(np.array(train_x_all[i])[queried_indexs],
                                    np.array(train_y_all)[queried_indexs],
                                    feature_now[i])
        print(">>>>>> Terminate ...")
        write_csv(episode, csvfile, model_selected, train_x_all, test_x_all,
                  train_y_all, test_y_all)
        episode += 1
Example #9
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "hp:q", ["help"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    path = ""
    quiet = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit(0)
        if opt == '-q':
            quiet = True
        elif opt == '-p':
            path = arg
        else:
            usage()
            sys.exit(0)
    if not path:
        usage()
        sys.exit(0)

    print("Starting tag scanning...")
    tagger = Tagger(path, quiet)
    tagger.scan_audio_files()
    print("...Done!")
    print()

    print("Starting file cataloguing...")
    dic = tagger.get_scanned_dic()
    cataloguer = Cataloguer(path, dic, quiet)
    cataloguer.create_catalogue()
    print("...Done!")
    print()
Example #10
 def learn(self, num_epochs, config_dict, seed):
     # config_dict contains a chosen value for each parameter
     model = Tagger(self.modelname, self.datafile, self.paramfile,
                    num_epochs, self.batchsize, **config_dict)
     # train
     metrics = model.train(num_epochs, seed, **config_dict)
     # metrics is dict = {epoch: (model, train_loss, dev_loss,test_loss, acc, f1_macro, f1_weighted)}
     return metrics
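
Given the metrics layout documented in the comment, a caller can pick the best epoch by macro F1 (index 5 of each tuple). A sketch with toy values:

metrics = {1: (None, 0.9, 0.8, 0.8, 0.70, 0.65, 0.68),
           2: (None, 0.5, 0.6, 0.6, 0.75, 0.71, 0.73)}
best_epoch = max(metrics, key=lambda e: metrics[e][5])
best_model = metrics[best_epoch][0]  # the stored model for that epoch
print(best_epoch)  # prints: 2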
Example #11
 def __init__(self):
     Tagger.__init__(self)
     self.upos = []
     self.model = Pipeline([
         ('vectorizer', DictVectorizer()),
         ('classifier', LogisticRegressionCV(Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2', scoring=None, \
             solver='lbfgs', tol=0.0001, max_iter=100, class_weight=None, n_jobs=1, verbose=0, refit=True, intercept_scaling=1.0, \
             multi_class='ovr', random_state=None))
     ])
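
DictVectorizer consumes one feature dict per token, so a pipeline like the one above trains directly on dicts. A toy fit/predict sketch (the feature names and data are illustrative; cv is lowered to fit the tiny sample):

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('vectorizer', DictVectorizer()),
    ('classifier', LogisticRegressionCV(Cs=3, cv=2, solver='lbfgs')),
])
X = [{'w': 'dog'}, {'w': 'cat'}, {'w': 'runs'}, {'w': 'eats'}]
y = ['NOUN', 'NOUN', 'VERB', 'VERB']
model.fit(X, y)
print(model.predict([{'w': 'dog'}]))  # likely: ['NOUN']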
Example #12
def test_run1(filename, test_product, no_of_clusters):
    conv = Converter()
    cl = Cluster()
    tg = Tagger()

    targetCategory = test_product['category']
    result = cl.test_run(filename, test_product, no_of_clusters)
    category = tg.readCategory(result)
    conv.run('r200.txt', result, category, targetCategory, 'newSum1.txt')
Example #13
def play_ner(feature_now, model_ver, poly, niter, logit, method):
    actions = 2
    global BUDGET
    
    tf.reset_default_graph()
    if AGENT == "LSTMQ":
        robot = RobotLSTMQ(actions, FEATURE, content=CONTENT, poly=poly,
                           logit=logit, ntype=NTYPE, expnum=EXPNUM)
    else:
        print("** There is no robot.")
        raise SystemExit

    model_selected = []

    for i in feature_now:
        with tf.name_scope(model_ver + '/feature_{0}'.format(i)):
            model = Tagger(model_file=model_ver + '/feature_{0}'.format(i),
                           n_input=FEATURE_SHAPE[i][0],
                           n_steps=FEATURE_SHAPE[i][1],
                           feature_number=i, epochs=niter, expnum=EXPNUM)
        model.train([], [], feature_number=i)
        model_selected.append(model)

    game = initialise_game(model_selected, BUDGET, NITER, FEATURE, method)

    # play the game with the decision robot initialised above
    episode = 1

    rAll = []
    while episode <= MAX_EPISODE:

        observation = game.get_frame(model_selected)
        action = robot.get_action(observation)

        reward, observation2, terminal = game.feedback(action, model_selected)
        game.rAll.append(reward)
        rAll.append(reward)

        robot.update(observation, action, reward, observation2, terminal)

        if terminal:
            print("> Episodes finished: ", float("%.3f" % (episode/MAX_EPISODE)), "> Reward: ", float("%.3f" % np.mean(rAll)))
            episode += 1
            rAll = []
            if episode == MAX_EPISODE:
                print('in')
                robot.save_Q_network(MODEL_VER)
                weights = find_weight.find_weight(model_selected, game.dev_x_all, game.dev_y_all)
                np.save(model_ver+'.npy', weights)
                print(weights)
    return robot
Example #14
    def download(self):
        audio = pafy.new(self.url).getbestaudio()
        file = audio.download()

        self.newtitle = self.slugify(audio.title)
        self.__convertToMp3(file, audio.extension)
        tagger = Tagger(self.newtitle + '.mp3', self.title, self.artist,
                        self.genre, self.album)
        mp3 = tagger.editTags()
        return self.__renameFile(mp3)
Example #15
def mainTag(featureSet, options):
    transModel = None
    if not (options['printWeights'] or options['toCRFsuite']):
        print('loading transition model...', end='', file=sys.stderr, flush=True)
        transModel = TransModel.getModelFromFile(options['transModelFileName'])
        print('done', file=sys.stderr, flush=True)

    tagger = Tagger(featureSet, transModel, options)
    if 'inFeatFile' in options and options['inFeatFile']:
        # Tag a featurized file to outputStream
        for sen, comment in tagger.tagFeatures(options['inFeatFile']):
            writeSentence(sen, options['outputStream'], comment)
    elif 'ioDirs' in options and options['ioDirs']:
        # Tag all files in a directory to fileName.tagged
        for sen, fileName in tagger.tagDir(options['ioDirs'][0]):
            writeSentence(sen, open(join(options['ioDirs'][1], '{0}.tagged'.format(fileName)), 'a', encoding='UTF-8'))
    elif 'toCRFsuite' in options and options['toCRFsuite']:
        # Make CRFsuite format to outputStream for tagging
        tagger.toCRFsuite(options['inputStream'], options['outputStream'])
    elif 'printWeights' in options and options['printWeights']:
        # Print MaxEnt weights to STDOUT
        tagger.printWeights(options['printWeights'], options['outputStream'])
    else:
        # Tag inputStream to outputStream
        for sen, comment in tagger.tagCorp(options['inputStream']):
            writeSentence(sen, options['outputStream'], comment)
Example #16
    def __init__(self, config, tagger=False, scraper=False, prepare_db=False):
        self.config = config
        if prepare_db and not self._prepare_database():
            return

        if scraper:
            sc = Scraper(self.config['parlis']['url'])
            sc.run()
        if tagger:
            ta = Tagger()
            ta.run()
Example #17
 def genKeyWords(self, question):
   questionToken = self.preProcess(question)
   tagger = Tagger('portugues')
   token = tagger.classify(questionToken)
   keyList = []
   for tok in token:
     if tok[1] == 'N' or re.match('ADJ', tok[1]) or re.match('V', tok[1]):
       keyList.append(tok)
   print(keyList)
   print(len(keyList))
   return keyList
Example #18
def AIC_predict():
    print("iter : ", config.iter)
    # Apply PIC processing to the morphologically analyzed raw_sentence
    # input : config.result_input_path
    # output : config.result_processed_path
    main_tagger_PIC = Tagger()
    main_tagger_PIC.taggingPIC("result_tagging")

    # Apply AIC to the PIC-processed raw_sentence
    # input : config.result_processed_path
    # output : config.result_output_path
    main_tagger_AIC = Tagger()
    main_tagger_AIC.evaluateAIC("result")
    main_tagger_AIC.main_taggingAIC(mode="result_tagging")
Example #19
    def __init__(self):
        # Load the pre-trained POS-Tagger data
        uni, bi, tri, word = self.load_obj("tagger")
        self.TAGGER1 = Tagger(uni, bi, tri, word)

        # Load the second pre-trained POS-Tagger data
        uni2, bi2, tri2, word2 = self.load_obj("tagger2")
        self.TAGGER2 = Tagger(uni2, bi2, tri2, word2)

        self.TAGGER3 = CRFTagger()
        self.TAGGER3.set_model_file(
            'dataset/all_indo_man_tag_corpus_model.crf.tagger')

        # Load the grammar chunker data
        self.load_chunker()
Example #20
 def gen_opt(self, file_text):
     '''
     Method that generates the new text.
     Each word is classified and concatenated with its type via the '/' character,
     then concatenated with the synset's return value.
     '''
     tagger = Tagger('portugues')
     tok = word_tokenize(file_text.read().decode('utf-8'))
     clas = tagger.classify(tok)
     p_text = []
     for c in clas:
         if c[1] == 'N' or re.match('ADJ',c[1]) or re.match('V',c[1]) or c[1] == '.':
             gen_set = self.gen_synset(c)
             p_text.append(gen_set)
     optimized_text = ' '.join(p_text)
     return optimized_text
Example #21
    def test_match_label_IOB_applied_correctly(self):
        tagger = Tagger()

        input = [[("Brunel", "", "", ""), ("University", "", "", ""),
                  ("test", "", "", ""), ("test", "", "", "")],
                 [("test", "", "", ""), ("test", "", "", ""),
                  ("Brunel", "", "", ""), ("University", "", "", "")],
                 [("test", "", "", ""), ("test", "", "", ""),
                  ("Brunel", "", "", ""), ("University", "", "", "")]]

        input_label = "Brunel University"
        input_match_tag = "match"
        output = tagger.match_label(input, input_label, input_match_tag)
        output = tagger.match_label(output, input_label, input_match_tag)
        output = tagger.match_label(output, input_label, input_match_tag)
        output = tagger.add_default_entity_tags(output)

        correct_iob = True
        for line in output:
            for token_idx, token in enumerate(line):
                if token[3].split("-", 1)[0] == "O":
                    next_token = "EOL" if len(line) == token_idx + 1 else line[
                        token_idx + 1][3].split("-", 1)[0]
                    if next_token == "I":
                        correct_iob = False
        self.assertEqual(correct_iob, True)
Example #22
    def analyze(self, text, tokenizer=str.split):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.
            tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.

        Returns:
            res: dict.
        """
        if not self.tagger:
            self.tagger = Tagger(self.model,
                                 preprocessor=self.p,
                                 tokenizer=tokenizer)

        return self.tagger.analyze(text)
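
The method builds the Tagger lazily and memoizes it on self.tagger, so repeated analyze calls reuse one instance. The same lazy-initialization pattern in isolation (names and the stand-in function are illustrative):

class Analyzer:
    def __init__(self):
        self.tagger = None  # deferred until the first call

    def analyze(self, text):
        if not self.tagger:
            # stands in for an expensive model load
            self.tagger = str.split
        return self.tagger(text)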
Example #23
def run():
    print("start")
    listener = Recog()
    tagger = Tagger()
    print("done setting up")
    while True:
        try:
            sentence = listener.listen()
            # sentence = "make line graph using range from A1 to E4"
            tags = tagger.match_rules(sentence)
            print(tags)
            process(tags)
        except KeyboardInterrupt:
            break
        except Exception:
            continue
Example #24
def conservative_sampling(feature_now, model_ver, budget):
    csvfile = 'records_cs/'+model_ver+'.csv'
    model_selected = []

    for i in feature_now:
        with tf.name_scope(model_ver + '/feature_{0}'.format(i)):
            model = Tagger(model_file=model_ver + '/feature_{0}'.format(i),
                           n_input=FEATURE_SHAPE[i][0],
                           n_steps=FEATURE_SHAPE[i][1],
                           feature_number=i)
        model.train([], [], feature_number=i)
        model_selected.append(model)
        
    train_data, test_data = data_generation(model_selected, feature_now)
    train_x_all = train_data[0]
    test_x_all = test_data[0]
    train_y_all = train_data[1]
    test_y_all = test_data[1]

    episode = 1
    print(">>>>>> Playing game ..")
    while episode <= MAX_EPISODE:
        # compute uncertainty, which is 1 - confidence:

        sample_N = min(budget*4,len(train_y_all))
        
        N = len(train_y_all)
        budget = min(budget,N)
        confidence = []
        conf_diff = np.zeros((sample_N,))
        
        for i in range(len(model_selected)):
            confidence.append(model_selected[i].get_confidence(list(train_x_all[i][:sample_N])))
        # indices of the most confident model per sample
        ind_max = np.argmax(confidence, axis=0)
        # indices of the least confident model per sample
        ind_min = np.argmin(confidence, axis=0)
        for i in range(sample_N):
            conf_diff[i] = confidence[ind_max[i]][i]-confidence[ind_min[i]][i]
        queried_indexs = sorted(range(len(conf_diff)), key=lambda i: conf_diff[i])[:budget]
        
        print('top uncertainties found')
        for i in range(len(model_selected)):
            model_selected[i].train(np.array(train_x_all[i])[queried_indexs],
                                    np.array(train_y_all)[queried_indexs],
                                    feature_now[i])
        print(">>>>>> Terminate ...")
        write_csv(episode, csvfile, model_selected, train_x_all, test_x_all,
                  train_y_all, test_y_all)
        episode += 1
Example #25
class SearchEngine:
    """
    classe que considero a principal desse modulo. É a estrutura de dados que
    contém os arquivos para o processamento da engine.
    """

    def __init__(self):
        self.tagger = Tagger('portugues')

    def insert(self, files):
        """
        Esse método tem como entrada um array de arquivos, retornando uma lista de indexação reversa.
        """
        dataset = []
        for f in files:
            paragraph = sent_tokenize(f[1].lower())
            for index, p in enumerate(paragraph):
                words = word_tokenize(p)
                classes = self.tagger.classify(words)
                for c in classes:
                    if re.match('N', c[1]):
                        keysId = [item['_id'] for item in dataset]
                        print('number of keys: ' + str(len(keysId)))
                        if c[0] in keysId:
                            ind = keysId.index(c[0])
                            files = dataset[ind]
                            if os.path.basename(f[0]) in files.keys():
                                if not index in dataset[ind][os.path.basename(f[0])]:
                                    dataset[ind][os.path.basename(f[0])].append(index)
                            else:
                                dataset[ind][os.path.basename(f[0])] = [index]
                        else:
                            dataset.append({'_id': c[0], os.path.basename(f[0]): [index]})
        return dataset

    def extract(self, data):
        """
        Simple search algorithm that returns a tuple with the files.
        """
        print('reached the extract method')
        for d in data:
            paragraphs = []
            try:
                d.pop('_id')
            except KeyError:
                pass
            for k in d.keys():
                path_name = os.path.abspath('backend') + '/texts/'
                text = open(path_name + k + '.txt', encoding='utf-8').read()
                text_sent = sent_tokenize(text)
                for index in d[k]:
                    paragraphs.append(text_sent[index])
        print(paragraphs)
        return set(paragraphs)
Example #26
def tag_file(training_filename, input_filename, output_filename):
    print('Training: ' + training_filename)
    tagger = Tagger(*process_pos_file(training_filename))

    print('Reading input: ' + input_filename)
    sentences = read_words_file(input_filename)

    print('Writing tagged output: ' + output_filename)
    count_tags = 0
    with open(output_filename, 'w') as output_file:
        length = len([word for sent in sentences for word in sent])
        for tagged_sentence in tagger.tag(sentences):
            for (tag, word) in tagged_sentence:
                output_file.write(tag + '\t' + word + '\n')
                count_tags += 1
                show_progress(count_tags / length)
            output_file.write('\n')
    print('Complete! Total tags: ' + str(count_tags) + '.')
Example #27
def brain(command):
    response = ""
    command = command
    # from 0  =>> 15 is verb for search and find
    # from 16 =>> 21 is verb for open
    actions = [
        "search", "find", "view", "reach", "detect", "get", "catch", "explore",
        "achieve", "obtain", "pass", "check", "reveal", "expose", "observe",
        "show", "see", "listen", "hear", "open", "watch", "arise", "awaken",
        "call", "consciousness", "get up", "stir", "wake", "wake up"
    ]

    tokens = Tokenizer().tokenize(command)

    # call weather function if there is weather word and country or city name
    citiesORcountries = weatherFunction(command)
    if 'weather' in command.split() and citiesORcountries != []:
        return 'the weather in ' + citiesORcountries[0] + ' is ' + WeatherC(
        ).weatherForecast(citiesORcountries[0]) + ' today'

    action = None

    fileName = None
    # -----------------------------------<<Variable>>--------------------------------------------
    tagSentence = Tagger().tag(tokens)

    for counter in range(len(tagSentence)):
        # if tagSentence[counter][1] == 'VB' or tagSentence[counter][0] in self.actions:

        if tagSentence[counter][0] in actions:

            action = tagSentence[counter][0]

        elif tagSentence[counter][1] == 'NN':
            fileName = tagSentence[counter][0]

    normlizeAction = Normalizer().snowBallStemmer(action)

    if normlizeAction in actions:
        filePath = FileSearch().search(
            fileName)  # return list of file shared the same name

        if normlizeAction in actions[:15]:
            # for search about folder or file
            OpenMedia().openFile(filePath[0].split("//")[0])
            response = "i hope you're satisfied with our service"
            return response

        if normlizeAction in actions[15:21]:
            if normlizeAction in ['listen', 'hear', 'watch'] \
                    and filePath[0].split('.')[1] not in ['mp3', 'mp4', 'mkv']:
                pass  # unfinished branch: non-media files are not handled yet
            OpenMedia().openFile(filePath[0])
Example #28
    def test_pos_tag_same_nr_tokens(self):
        tagger = Tagger()
        input, output = self.pos_tag_get_results(tagger)

        input_nr_tuples = [len(line) for line in input]
        input_nr_tuples = sum(input_nr_tuples)

        output_nr_tuples = [len(line) for line in output]
        output_nr_tuples = sum(output_nr_tuples)
        self.assertEqual(input_nr_tuples, output_nr_tuples)
Example #29
 def textAnalyse(self, text):
     t = Tagger(text)
     wordcloud = t.pos_tag()
     lines = text.split('.')
     pol_val = 0.0
     result = ''
     size = len(lines)
     for line in lines:
         temp = self.sia.polarity_scores(line)
         pol_val += temp['compound']
         print(line, temp)
     pol_val = pol_val / size
     if pol_val < -0.1:
         result = "n"
     elif pol_val > 0.1:
         result = "p"
     else:
         result = "x"
     print(pol_val, result)
     return (result, abs(pol_val) * 100.0, wordcloud)
Example #30
    def tag(self):
        if (not os.path.exists(self.lemma_file)) or 'tag' in self.args.no_cache:
            print('Tagging')
            self.lemmas = Tagger(self.args.obt_path, self.promises).tag()

            with open(self.lemma_file, 'w') as out:
                out.write(json.dumps(self.lemmas))
        else:
            print('Reading lemmas')
            with open(self.lemma_file, 'r') as file:
                self.lemmas = json.load(file)
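
The compute-or-load logic above is a general cache pattern: run the expensive step only when the cache file is missing or invalidated, otherwise read it back. A reusable sketch (the function and path names are illustrative):

import json
import os

def cached(path, compute, invalidate=False):
    # reuse the cache file unless it is missing or explicitly invalidated
    if os.path.exists(path) and not invalidate:
        with open(path, 'r') as f:
            return json.load(f)
    result = compute()
    with open(path, 'w') as f:
        json.dump(result, f)
    return result

lemmas = cached('lemmas.json', lambda: {'word': 'lemma'})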
Example #31
 def testCleanSeparateLineFeed(self):
     self.assertEqual(
         Tagger.cleanSeparateLineFeeds([
             'a\tbold\n', '\n', 'b\titalic\n', 'c\tunderline\n', '\n',
             'd\tteletype\n', 'e\tsize:1\n', '\n', 'f\tsize:7\n',
             'g\tcolor:000000\n', '\n', 'h\tcolor:FFFFFF\n', '\n'
         ]), [
             'a\tbold\n', 'b\titalic\n', 'c\tunderline\n', 'd\tteletype\n',
             'e\tsize:1\n', 'f\tsize:7\n', 'g\tcolor:000000\n',
             'h\tcolor:FFFFFF\n'
         ])
Example #32
    def test_pos_tag_nonlocalner_labels_not_altered(self):
        tagger = Tagger()
        input, output = self.pos_tag_get_results(tagger)

        same_nlner_labels_returned = True
        for line_idx, line in enumerate(output):
            for tuple_idx, tuple in enumerate(line):
                if tuple[2] != input[line_idx][tuple_idx][2]:
                    same_nlner_labels_returned = False

        self.assertEqual(same_nlner_labels_returned, True)
Example #33
    def test_match_label_pos_labels_not_altered(self):
        tagger = Tagger()
        input, output = self.match_label_get_results(tagger)

        same_pos_labels_returned = True
        for line_idx, line in enumerate(output):
            for tuple_idx, tuple in enumerate(line):
                if tuple[1] != input[line_idx][tuple_idx][1]:
                    same_pos_labels_returned = False

        self.assertEqual(same_pos_labels_returned, True)
Example #34
    def test_pos_tag_same_token_strs_returned(self):
        tagger = Tagger()
        input, output = self.pos_tag_get_results(tagger)

        same_tokens_returned = True
        for line_idx, line in enumerate(output):
            for tuple_idx, tuple in enumerate(line):
                if tuple[0] != input[line_idx][tuple_idx][0]:
                    same_tokens_returned = False

        self.assertEqual(same_tokens_returned, True)
Example #35
def pos_tag(tweets):
    """
    Uses the POS tagger interface to tag parts of speech in all the tweet
    texts and stores the tags on each tweet object.
    """
    print("Tagging...")
    untagged_texts = []
    for tweet in tweets:
        tagger = Tagger()
        textbody = tweet.text
        for phrase in re.split(r"\.|!|\?", textbody):
            if len(phrase) < 2:
                continue
            phrase = phrase.replace("?", "")
            phrase = phrase.replace("!", "")
            phrase = phrase.replace(".", "")
            tags = tagger.tag_text(phrase)
            if tags is not None:
                tweet.tagged_words.append(tags)
    print("Untagged texts: ")
    for text in untagged_texts:
        print(text)
    print("Tagging done.")
    return tweets
Example #36
    def __init__(self, p):
        self.save_dir, _ = generate_directory(p.save_to)
        self.p = p

        print_p(self.p)

        self.tagger = Tagger.create_tagger(self.p)

        if 'load_from' in self.p and (self.p.load_from is not None):
            self.load_model(self.p.load_from)

        logger.info('Setting up data...')
        self.streams = setup_data(self.p, use_unlabeled=True, use_labeled=True)
Example #37
def main(training_file, training_dir, load_model, skip_train):
    logging.debug('Initializing random seed to 0.')
    random.seed(0)
    np.random.seed(0)

    if load_model:
        tagger = Tagger.load(load_model)
        data = TaggingDataset.load_from_file(training_file, vocab=tagger.vocab, tags=tagger.tags)
    else:
        assert not skip_train, 'Cannot --skip_train without a saved model.'
        logging.debug('Loading dataset from: %s' % training_file)
        data = TaggingDataset.load_from_file(training_file)
        logging.debug('Initializing model.')
        tagger = Tagger(data.vocab, data.tags)

    if not skip_train:
        train_data, dev_data = data.split(0.7)

        batches_train = train_data.prepare_batches(n_seqs_per_batch=10)
        batches_dev = dev_data.prepare_batches(n_seqs_per_batch=100)

        train_mgr = TrainingManager(
            avg_n_losses=len(batches_train),
            training_dir=training_dir,
            tagger_taste_fn=lambda: taste_tagger(tagger, batches_train),
            tagger_dev_eval_fn=lambda: eval_tagger(tagger, batches_dev),
            tagger_save_fn=lambda fname: tagger.save(fname)
        )

        logging.debug('Starting training.')
        while train_mgr.should_continue():
            mb_x, mb_y = random.choice(batches_train)
            mb_loss = tagger.learn(mb_x, mb_y)

            train_mgr.tick(mb_loss=mb_loss)

    evaluate_tagger_and_writeout(tagger)
Example #38
 def generation(self):
     self.tokenized = [nltk.word_tokenize(self.sentences[i]) for i in range(len(self.sentences))]
     self.generate_average_position()
     self.types = {}
     tagger = Tagger(False)
     for i in range(len(self.tokenized)):
         typess = tagger.tag_sent(self.tokenized[i])
         for j in range(len(typess)):
             word,val = typess[j]
             self.types.setdefault(word, []).append(val)
     for element in self.types:
         most_common, num_most_common = Counter(self.types[element]).most_common(1)[0]
         self.types[element] = most_common
     num_sent = 1
     for sent in self.tokenized:
         actual = sent
         num_word = 1
         last = None
         for mot in actual:
             # use a separate name for the node so it does not shadow the sentence
             if not self.isWordIn(mot):
                 node = Etiquette(mot, num_sent, num_word)
                 self.nodes.append(node)
             else:
                 node = self.get_node_with_value(mot)
                 node.add_sid_pid(num_sent, num_word)
             if num_word > 1:
                 last.add_next(node.get_id())
             last = node
             num_word += 1
         num_sent += 1
Example #39
def main(args):
    logging.debug('Initializing random seed to 0.')
    random.seed(0)
    np.random.seed(0)
    tf.set_random_seed(0)

    logging.debug('Loading training dataset from: %s' % args.training_file)
    train_data = TaggingDataset.load_from_file(args.training_file)
    dev_data = TaggingDataset.load_from_file(None, vocab=train_data.vocab,
                                             alphabet=train_data.alphabet, tags=train_data.tags)
    logging.debug('Initializing model.')
    tagger = Tagger(train_data.vocab, train_data.tags, train_data.alphabet,
                    word_embedding_size=args.word_embedding_size,
                    char_embedding_size=args.char_embedding_size,
                    num_chars=args.max_word_length,
                    num_steps=args.max_sentence_length,
                    optimizer_desc=args.optimizer,
                    generate_lemmas=args.generate_lemmas,
                    l2=args.l2,
                    dropout_prob_values=[float(x) for x in args.dropout.split(",")],
                    experiment_name=args.exp_name,
                    supply_form_characters_to_lemma=args.supply_form_characters_to_lemma,
                    threads=args.threads,
                    use_attention=args.use_attention,
                    scheduled_sampling=args.scheduled_sampling)

    batches_train = train_data.prepare_batches(
        args.batch_size, args.max_sentence_length, args.max_word_length)
    batches_dev = dev_data.prepare_batches(
        2100, args.max_sentence_length, args.max_word_length)

    train_mgr = TrainingManager(
        len(batches_train), args.eval_interval,
        training_dir=args.training_dir,
        tagger_taste_fn=lambda: taste_tagger(tagger, batches_train),
        tagger_dev_eval_fn=lambda: eval_tagger(tagger, batches_dev),
        tagger_save_fn=lambda fname: tagger.save(fname)
    )

    import signal
    force_eval = {"value": False}
    def handle_sigquit(signum, frame):
        logging.debug("Ctrl+\\ received, evaluation will be forced.")
        force_eval["value"] = True
    signal.signal(signal.SIGQUIT, handle_sigquit)

    logging.debug('Starting training.')
    try:
        permuted_batches = []
        while train_mgr.should_continue(max_epochs=args.max_epochs):
            if not permuted_batches:
                permuted_batches = batches_train[:]
                random.shuffle(permuted_batches)
            words, chars, tags, lengths, lemma_chars, chars_lengths = permuted_batches.pop()
            oov_mask = np.vectorize(lambda x: train_data.vocab.count(x) == 1 and np.random.uniform() < args.oov_sampling_p)(words)
            words = np.where(oov_mask, np.zeros(words.shape), words)
            mb_loss = tagger.learn(words, chars, tags, lengths, lemma_chars, chars_lengths)

            train_mgr.tick(mb_loss=mb_loss, force_eval=force_eval["value"])
            force_eval["value"] = False
    except KeyboardInterrupt:
        logging.debug("Ctrl+C recieved, stopping training.")

    run_tagger_and_writeout(tagger, dev_data)
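
The SIGQUIT handler above only flips a flag that the training loop polls, which keeps the signal handler trivial and the real work in the main thread. The same pattern in isolation (SIGQUIT is Unix-only; names are illustrative):

import signal

force_eval = {"value": False}

def handle_sigquit(signum, frame):
    force_eval["value"] = True  # just record the request

signal.signal(signal.SIGQUIT, handle_sigquit)

# in the long-running loop:
# if force_eval["value"]:
#     evaluate_now()
#     force_eval["value"] = False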
Example #40
 def testQuantifierPriorityAsterisk(self):
     self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('******'), '*')
Example #41
 def testQuantifierPriorityExclamation(self):
     self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('!!!!!!'), '!')
Example #42
    def testExpressionValidatorTwoDots(self):
        with self.assertRaises(SystemExit) as exit_val:
            Tagger.validateExpressions(['a..b'])

        self.assertEqual(exit_val.exception.code, 4)
Example #43
    def testCommandValidatorInvalidColor(self):
        with self.assertRaises(SystemExit) as exit_val:
            Tagger.validateCommands([['color:qwe@*fefeee']])

        self.assertEqual(exit_val.exception.code, 4)
Example #44
 def testQuantifierPriorityExclamationAndPlus(self):
     self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('+++!++'), '!')
Example #45
    def testExpressionValidatorNegationAndNegation(self):
        with self.assertRaises(SystemExit) as exit_val:
            Tagger.validateExpressions(['a!!b'])

        self.assertEqual(exit_val.exception.code, 4)
Example #46
    def testExpressionValidatorEmptyBrackets(self):
        with self.assertRaises(SystemExit) as exit_val:
            Tagger.validateExpressions(['(())'])

        self.assertEqual(exit_val.exception.code, 4)
Example #47
 def testQuantifierPriorityAsteriskAndPlus(self):
     self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('+**+*'), '*')
Example #48
 def testQuantifierPriorityPlusAndPipe(self):
     self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('|++||+'), '+')
Example #49
    def testExpressionValidatorAsteriskAndExclamation(self):
        with self.assertRaises(SystemExit) as exit_val:
            Tagger.validateExpressions(['a*!b'])

        self.assertEqual(exit_val.exception.code, 4)
Example #50
    def testExpressionValidatorPlusAndAsterisk(self):
        with self.assertRaises(SystemExit) as exit_val:
            Tagger.validateExpressions(['a+*b'])

        self.assertEqual(exit_val.exception.code, 4)
Example #51
    def testExpressionValidatorDisjunctionAndConcatenation(self):
        with self.assertRaises(SystemExit) as exit_val:
            Tagger.validateExpressions(['a|.b'])

        self.assertEqual(exit_val.exception.code, 4)
Example #52
 def testQuantifierPriorityDot(self):
     self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('..........'), '.')
Example #53
 def testQuantifierPriorityExclamationAndDot(self):
     self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('..!!..'), '!')
Example #54
 def testQuantifierPriorityExclamationAndPipe(self):
     self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('||!!|'), '!')
Example #55
 def testReformatMultipleQuantifiers(self):
     self.assertEqual(Tagger.bonusReformatMutlipleQuantifiers('sub+++'), 'sub+')
Example #56
 def testQuantifierPriorityPipe(self):
     self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('||||'), '|')
Example #57
 def testQuantifierPriorityPlus(self):
     self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('+++++++'), '+')
Example #58
def evaluate(n, corpus):
    """Runs the n-fold validation on a corpus"""
    if n < 1:
        n = 10
        print("n was to low and has been set to 10\n")

    # Get all the data
    sentences, correctTags, tagData = corpusReader(corpus)
    allCor = []
    allIncor = []


    for check in range(1, n+1):
        # Divide all the data
        divSent = dividList(sentences, n, check)
        divTags = dividList(correctTags, n, check)
        divTrain = dividList(tagData, n, check)
        # To count the total of incorrect and correct tags
        correctlyTagged = []
        incorrectlyTagged = []

        print("Check {} doing {}-fold on {}\n".format(check, n, corpus))

        # For each part to evaluate
        for i in range(0, n):
            # Get the parts to train on
            trainingParts = divTrain[:i] + divTrain[i+1:]
            train = []

            # They need to be formatted so that we can use the Tagger
            for index in range(len(trainingParts)):
                train.extend(trainingParts[index])

            # Get the testing and evaluation data
            testingData = divSent[i]
            evaluationData = divTags[i]

            # Do some training
            uni, bi, tri, word = PB(train)
            tagger = Tagger(uni, bi, tri, word)

            # Reset counts
            correctTagCount = 0
            incorrectTagCount = 0

            # Go through each sentence and tag it
            for index in range(len(testingData)):
                tagged = tagger.tagSentence(testingData[index])
                for tag in range(len(tagged)):
                    # If correct
                    if evaluationData[index][tag] == tagged[tag]:
                        correctTagCount += 1
                    else:
                        incorrectTagCount += 1
            
            # Print to let you know I haven't forgotten about you.
            print("{}-fold was tagged {}% correctly.".format(i+1, round(correctTagCount / (correctTagCount + incorrectTagCount)*100,2 )))

            # Save n-fold counts
            correctlyTagged.append(correctTagCount)
            incorrectlyTagged.append(incorrectTagCount)

        allCor.extend(correctlyTagged)
        allIncor.extend(incorrectlyTagged)

        # Total in numbers..
        print("\n{} out of {} was correctly tagged.".format(sum(correctlyTagged), sum(correctlyTagged) + sum(incorrectlyTagged)))

        # .. and percentage
        print("\nFor a total of {}% correctness.".format(round(sum(correctlyTagged) / (sum(correctlyTagged) + sum(incorrectlyTagged))*100, 2)))

    # Total in numbers..
    print("\n{} out of {} was correctly tagged.".format(sum(allCor), sum(allCor) + sum(allIncor)))

    # .. and percentage
    print("\nFor a total of {}% correctness.".format(round(sum(allCor) / (sum(allCor) + sum(allIncor))*100, 2)))
Example #59
 def testCommandValidatorMaxSize(self):
     # passes as long as validateCommands() does not raise SystemExit
     Tagger.validateCommands([['size:7']])
     self.assertTrue(self)