# project-local modules assumed by this script
import tagger
import parser
import dataReader
# import wsm


def main():
    sv_tags = [
        "<ROOT>", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB"
    ]
    en_tags = [
        "<ROOT>", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"
    ]
    # duplicate 'det'/'discourse' entries removed from the original list
    arc_tags = [
        '<ROOT>', 'name', 'nsubjpass', 'dobj', 'acl', 'advcl', 'advmod',
        'amod', 'appos', 'aux', 'case', 'cc', 'ccomp', 'clf', 'compound',
        'conj', 'cop', 'csubj', 'dep', 'det', 'discourse', 'dislocated',
        'expl', 'fixed', 'flat', 'goeswith', 'iobj', 'list', 'mark', 'nmod',
        'nsubj', 'nummod', 'obj', 'obl', 'orphan', 'parataxis', 'punct',
        'reparandum', 'root', 'vocative', 'xcomp'
    ]
    sv_train_file = "treebanks/sv_train.conllu"
    sv_test_file = "treebanks/sv_dev.conllu"
    en_train_file = "treebanks/en_train.conllu"
    en_test_file = "treebanks/en_dev.conllu"
    # model = wsm.WSM()
    # model.create_model(en_train_file)
    model_path = 'models/wsm_en'
    myTagger = tagger.Tagger(en_tags)
    arc_tagger = tagger.Tagger(arc_tags)
    myParser = parser.Parser(myTagger, arc_tagger, model_path)
    dataReader.evaluate(en_train_file, en_test_file, myParser)
    dataReader.evaluate(sv_train_file, sv_test_file, myParser)
def __init__(self, backtrack, dictionary, mapping, threshold=0.0,
             do_backtrack=False, debug=False,
             type_weight={'firsttitle': 3, 'title': 2, 'text': 1}):
    self._debug = debug
    self._backtrack = backtrack
    self._do_backtrack = do_backtrack
    self._threshold = threshold
    self._dictionary = dictionary
    self._mapping = mapping
    self._pages = set()
    self._type_entity_page = {}
    self._type_page_entity_count = {}
    self._page_entity_synonyms = {}
    self._tag = tagger.Tagger()
    self._max_tokens = 15
    self._type_weight = type_weight
    self._tag.LoadNames("%s_entities.tsv" % self._dictionary,
                        "%s_names_expanded.tsv" % self._dictionary)
    self._entity_name = self.load_names()
def create_protein_engine(ddef, context):
    db = context['db']
    logger = context['logger']
    logger.info('preparing protein engine...')
    dict_c_name = ddef['dictionary_collection']
    logger.info('\tdictionary collection name is: ' + dict_c_name)
    blacklist_name = ddef['blacklist']
    logger.info('\tblacklist collection name is: ' + blacklist_name)
    whitelist_name = ddef['whitelist']
    logger.info('\twhitelist collection name is: ' + whitelist_name)
    entity_type = ddef['entity_type']
    tgr = tagger.Tagger()
    for edict in db[dict_c_name].find():
        for entry in edict['dictionary']:
            p_a = entry[PRIMARY_ACCESSION].encode("utf-8")
            for word in entry['words']:
                tgr.add_name(word.encode("utf-8"), entity_type, p_a)
    black_timestamp = tagger_block_blacklist(db[blacklist_name], tgr)
    white_timestamp = str_DEFAULT_TIMESTAMP
    wlitems = ddef.get('whitelist_items', None)
    if len(whitelist_name) > 0 and wlitems:
        white_timestamp = tagger_add_whitelist(db[whitelist_name], tgr,
                                               entity_type,
                                               wlitems[0], wlitems[1])
    return tgr, black_timestamp, white_timestamp
import hashlib

import tagger


def loadsong():
    # Map each song to its SHA-256 digest, keyed for fast lookup.
    x = {}
    for song in tagger.Tagger().scan():
        m = hashlib.sha256()
        m.update(song.link.encode("utf-8"))
        x[m.digest()] = song
    return x
def create_chemical_engine(ddef, context):
    db = context['db']
    logger = context['logger']
    logger.info('preparing chemical engine...')
    dict_c_name = ddef['dictionary_collection']
    logger.info('\tdictionary collection name is: ' + dict_c_name)
    blacklist_name = ddef['blacklist']
    logger.info('\tblacklist collection name is: ' + blacklist_name)
    whitelist_name = ddef['whitelist']
    logger.info('\twhitelist collection name is: ' + whitelist_name)
    entity_type = ddef['entity_type']
    tgr = tagger.Tagger()
    if len(dict_c_name) > 0:
        for entry in db[dict_c_name].find():
            c_key = entry['key'].encode("utf-8")
            for word in entry['words']:
                tgr.add_name(word.encode("utf-8"), entity_type, c_key)
    black_timestamp = str_DEFAULT_TIMESTAMP
    if len(blacklist_name) > 0:
        black_timestamp = tagger_block_blacklist(db[blacklist_name], tgr)
    white_timestamp = str_DEFAULT_TIMESTAMP
    wlitems = ddef.get('whitelist_items', None)
    if len(whitelist_name) > 0 and wlitems:
        white_timestamp = tagger_add_whitelist(db[whitelist_name], tgr,
                                               entity_type,
                                               wlitems[0], wlitems[1])
    logger.info('\tload chemical dictionary completed.')
    return tgr, black_timestamp, white_timestamp
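create_protein_engine and create_chemical_engine differ only in how dictionary entries are keyed; the blacklist/whitelist tail is identical. A sketch of how that shared tail could be factored out, reusing the helpers already referenced above (the factored function itself is hypothetical, not part of the project):

# Hypothetical helper consolidating the blacklist/whitelist wiring shared by
# the two engine constructors above; tagger_block_blacklist,
# tagger_add_whitelist, and str_DEFAULT_TIMESTAMP are the names already
# used in this file.
def apply_black_white_lists(ddef, db, tgr, entity_type):
    blacklist_name = ddef['blacklist']
    whitelist_name = ddef['whitelist']
    black_timestamp = str_DEFAULT_TIMESTAMP
    if len(blacklist_name) > 0:
        black_timestamp = tagger_block_blacklist(db[blacklist_name], tgr)
    white_timestamp = str_DEFAULT_TIMESTAMP
    wlitems = ddef.get('whitelist_items', None)
    if len(whitelist_name) > 0 and wlitems:
        white_timestamp = tagger_add_whitelist(db[whitelist_name], tgr,
                                               entity_type,
                                               wlitems[0], wlitems[1])
    return black_timestamp, white_timestamp

# usage sketch: black_ts, white_ts = apply_black_white_lists(ddef, db, tgr, entity_type)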
def __init__(self, entities=None, names=None, globs=None, data_dir=None):
    # Defaults #
    if data_dir is None:
        data_dir = default_data_dir
    if entities is None:
        entities = data_dir + 'envo_entities.tsv'
    if names is None:
        names = data_dir + 'envo_names.tsv'
    if globs is None:
        globs = data_dir + 'envo_global.tsv'
    # Make an instance of the API #
    import tagger as tagger_api
    self.api = tagger_api.Tagger()
    self.api.LoadNames(entities, names)
    self.api.LoadGlobal(globs)
def predict(json_data, result):
    try:
        appid = json_data['appid']
        sentence = json_data['sentence']
        sentence = process_chinese_number(sentence)
        e = engines[appid]
        vf = vocab_file(appid, CLASSIFIER)
        _, v = dp.build_vocabulary(vf)
        labels = load_labels(appid)
        # jieba.load_userdict(vf)
        user_dict = load_dict(data_dir(appid) + "user_dict.txt")
        load_dict_for_jiaba(user_dict)
        reload = False
        if 'reload' in e:
            reload = e['reload']
        if (CLASSIFIER not in e) or reload:
            e[CLASSIFIER] = cc.Classifier(v, appid,
                                          data_dir(appid) + 'classifier',
                                          labels)
        operation = e[CLASSIFIER].predict(sentence)
        if operation == '':
            return 'operation not gotten'
        logging.info("the operation is %s" % operation)
        category = appid + "_" + operation
        vf = vocab_file(appid, category)
        if (operation not in e) or reload:
            e[operation] = tagger.Tagger(data_file(appid, operation), vf,
                                         category,
                                         data_dir(appid) + category,
                                         user_dict)
        pairs = e[operation].determine(sentence)
    except Exception as e:
        logging.error(e)
        logging.debug(format_tb(e.__traceback__))
        return e
    result['appid'] = appid
    result['operation'] = operation
    result['data'] = pairs
    return result
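predict() takes an already-parsed JSON dict plus a result dict to fill in; a usage sketch under the assumption that a "demo" app has already been trained (the appid and sentence are invented):

# Hypothetical call to predict() above; assumes train() has been run for
# the "demo" appid so its classifier and taggers exist on disk.
out = predict({"appid": "demo", "sentence": "turn on the light"}, {})
# out -> {"appid": "demo", "operation": <predicted label>, "data": <slot pairs>}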
def __init__(self, backtrack, dictionary, mapping, threshold=0.0,
             do_backtrack=False, debug=False):
    self._debug = debug
    self._backtrack = backtrack
    self._do_backtrack = do_backtrack
    self._threshold = threshold
    self._dictionary = dictionary
    self._mapping = mapping
    self._pages = set()
    self._page_entity = {}
    self._tag = tagger.Tagger()
    self._max_tokens = 15
    self._tag.LoadNames("%s_entities.tsv" % self._dictionary,
                        "%s_names_expanded.tsv" % self._dictionary)
    self._entity_name = self.load_names()
def extract_event(self):
    self.dump_information()


class TodoExtractor(Extractor):
    def __init__(self, toks):
        Extractor.__init__(self, toks)

    def extract_todo(self):
        self.dump_information()


if __name__ == "__main__":
    common = Commonwords(commonwords_path)
    event_tagger = tagger.Tagger(classifier="event.ser.gz", port=1111)
    todo_tagger = tagger.Tagger(classifier="todo.ser.gz", port=2222)
    all_tagger = tagger.Tagger(classifier="all.ser.gz", port=3333)
    filenames = [f for f in os.listdir(test_dir)
                 if os.path.isfile(os.path.join(test_dir, f))]
    classifier = Classifier(all_tagger, common)
    for filename in filenames:
        if filename == "freq":
            continue
        msg = open(os.path.join(test_dir, filename), "r").read()
        print("++++++++++++++++++++++++++++++++++++++++")
        print(msg)
        print("++++++++++++++++++++++++++++++++++++++++")
import logging

import tagger
import geocoder
from hasher import hashsong
from laststuff import getartistinfo
from icecream import ic as debug
from flask import Flask, render_template, Response

app = Flask(__name__)
app.secret_key = "music"
app.config["TESTING"] = True

root = logging.getLogger()
root.setLevel(logging.DEBUG)

songsname = tagger.Tagger()


def loadsong():
    # Index scanned songs by the hex digest of their hash.
    x = {}
    for song in songsname.scan():
        m = hashsong(song)
        x[m.digest().hex()] = song
    return x


mysong = loadsong()
# debug(mysong)


@app.route("/")
entry_filter = "" for o, a in opts: if o == "-t": threshold = float(a) elif o == "-m": mapping = a #if not os.path.isfile(corpus_filename): # print >> sys.stderr, "%s does not exits." % corpus_filename # sys.exit(2) elif o == "-d": dictionary = a elif o == "-f": dictionary_filter = a tag = tagger.Tagger() tag.LoadNames("%s_entities.tsv" % dictionary, "%s_names_expanded.tsv" % dictionary) #check ignore stuff entry_filter = {} if not os.path.isfile(dictionary_filter): for line in open(dictionary_filter): f = line[:-1].split("\t") entry_filter[f[0]] = f[1] serial_entity = {} for line in open("%s_entities.tsv" % dictionary): serial, t, entry = line[:-1].split("\t") serial_entity[serial] = entry
def train(data):
    try:
        json_data = json.loads(data.decode('utf-8'))
        appid = json_data['appid']
        train_data = json_data['data']
        if appid not in engines:
            engines[appid] = {}
        engines[appid]['reload'] = True
        logging.info("start train for %s" % appid)
        base_dir = data_dir(appid)
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)
        # Create the labels file and store the classification labels
        classifier_data = {}
        labels = list(train_data.keys())
        logging.info("labels %s" % labels)
        lf = labels_file(appid)
        with open(lf, "w", encoding='utf-8') as f:
            data = train_data.keys()
            result = map(lambda x: x.strip() + "\n", data)
            f.writelines(result)
        user_dict = load_dict(data_dir(appid) + "user_dict.txt")
        load_dict_for_jiaba(user_dict)
        # Create one data file per category
        data_list = []
        for d in train_data.items():
            operation = d[0]
            t_data = d[1]
            df = data_file(appid, operation)
            category = appid + "_" + operation
            logging.info("create tagger for %s" % category)
            data_list.append((df, category))
            with open(df, "w", encoding='utf-8') as f:
                for line in t_data:
                    f.write(line + "\r\n")
                    # strip [...] slot annotations before feeding the
                    # classifier
                    pattern = re.compile(r'\[.*?\]')
                    line = re.sub(pattern, '', line)
                    classifier_data[line] = operation
        for data in data_list:
            df, category = data
            vf = vocab_file(appid, category)
            dp.create_vocabulary_from_data_file(vf, df)
            step = 2
            t = tagger.Tagger(df, vf, category, data_dir(appid) + category,
                              user_dict, step)
            t.train()
        vf = vocab_file(appid, CLASSIFIER)
        d = ".".join(list(classifier_data.keys()))
        lex, v = dp.create_vocabulary_from_data(vf, d, False)
        classifier = cc.Classifier(v, appid, data_dir(appid) + 'classifier',
                                   labels)
        classifier.train(classifier_data)
    except Exception as e:
        logging.error(e)
        logging.debug(format_tb(e.__traceback__))
        return False
    return True
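train() expects a UTF-8 JSON payload whose "data" maps each operation label to annotated lines; a sketch of that wire format (the labels and bracketed slot annotations are invented for illustration):

# Hypothetical training payload for train() above. Keys under "data" become
# classifier labels; the bracketed [...] spans are stripped before the lines
# are used as classifier training text.
example_payload = json.dumps({
    "appid": "demo",
    "data": {
        "turn_on": ["turn on the [light]", "switch the [fan] on"],
        "turn_off": ["turn off the [light]"],
    }
}, ensure_ascii=False).encode('utf-8')
# train(example_payload)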
import tagger as tag

""" Loading Corpus """
corpus_path = 'brown_corpus_modified'
myTagger = tag.Tagger()
tokenList = myTagger.load_corpus(corpus_path)

""" Initializing Frequencies and then the Probabilities for:
    Initial Tagging, Transition, Emission """
for sentence in tokenList:
    myTagger.initialize_frequencies(sentence)
myTagger.initialize_probabilities()

# print("Initial Tag Probability: \n", myTagger.get_initial_tag_probability())
#
# print("Transition Probability: \n", myTagger.get_transition_probability())
#
# print("Emission Probability: \n", myTagger.get_emission_probability())

""" Testing the Tagging using Viterbi Decode for two sentences:
    1. The Secretariat is expected to race tomorrow .
    2. People continue to enquire the reason for the race for outer space . """
sentence1 = "The Secretariat is expected to race tomorrow ."
sentence2 = "People continue to enquire the reason for the race for outer space ."
print("Tagging of Sentence 1: ", myTagger.viterbi_decode(sentence1))
print("Tagging of Sentence 2: ", myTagger.viterbi_decode(sentence2))
import sys
import time

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
import numpy as np

import tagger as tagg

start_time = time.time()

if __name__ == '__main__':
    folder = sys.argv[1]
    tagger_training = tagg.Tagger(folder + '/train')
    tagger_training.define_all_data()
    tagger_dev = tagg.Tagger(folder + '/dev')
    tagg.model = tagg.Net(len(tagger_training.vocab),
                          len(tagger_training.tags))
    tagg.optimizer = optim.Adam(tagg.model.parameters(), lr=0.001)
    tagger_dev.define_x_y_not_train(tagger_training.vocab,
                                    tagger_training.tags_to_ix, True)
    tagger_test = tagg.Tagger(folder + '/test')
    tagger_test.define_x_y_not_train(tagger_training.vocab,
                                     tagger_training.tags_to_ix, False)
    losses = []
    epochs = 30
    data = []
    for epoch in range(epochs):
        print("Start epoch " + str(epoch), str(tagg.passed_time(start_time)))
import os
import pickle

import tagger

datafile = os.path.join(os.path.dirname(__file__), '..', 'data/dict.pkl')
# print datafile
weights = pickle.load(open(datafile, 'rb'))

rdr = tagger.Reader()
stmr = tagger.Stemmer()
rtr = tagger.Rater(weights)
extract_tags = tagger.Tagger(rdr, stmr, rtr)
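extract_tags here is a callable Tagger instance; going by the mytagger(article, 6) call pattern later in this section, usage would look like the sketch below (the sample text is invented):

# Usage sketch: Tagger instances are called with (text, number_of_tags),
# matching the mytagger(article, 6) call further down this file.
sample = "Python is a widely used programming language for data analysis."
tags = extract_tags(sample, 5)
print(tags)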
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
    'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
    'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
    'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
    'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
    'skills'
]

weights = pickle.load(open('jobtagdict.pkl', 'rb'))  # or your own dictionary
# weights2 = pickle.load(open('skilltagdict.pkl', 'rb'))
weights2 = pickle.load(open('tagger/data/dict.pkl', 'rb'))

myreader = tagger.Reader()    # or your own reader class
mystemmer = tagger.Stemmer()  # or your own stemmer class
myrater = tagger.Rater(weights)  # or your own... (you got the idea)
myrater2 = tagger.Rater(weights2)

jobtagger = tagger.Tagger(myreader, mystemmer, myrater)
skilltagger = tagger.Tagger(myreader, mystemmer, myrater2)

# In[ ]:

with open("stopwords.pkl") as f:
    l = JobListings(f.read())
save_object(l.listings, r'joblistings-cs.pkl')

# In[46]:

with open('jobtagcorpus.txt', 'wb') as f:
    for job in listings:
        f.write(job.desc)
        if getattr(job, 'qualifications', False):
            f.write(job.qualifications)
import sys
import time

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
import numpy as np

import tagger as tagg

start_time = time.time()

if __name__ == '__main__':
    folder = sys.argv[1]
    # TODO make it optional
    vecs_file = sys.argv[2] if len(sys.argv) > 2 else False
    vocab_file = sys.argv[3] if len(sys.argv) > 3 else False
    tagger_training = tagg.Tagger(folder + '/train', True)
    # if we use the embedding words
    if vecs_file:
        name = ".embed."
        tagger_training.define_all_data_from_file(vecs_file, vocab_file)
    else:
        name = ""
        tagger_training.define_all_data()
    tagger_training.create_sub_words()
    tagger_training.activate_sub_words(tagger_training.vecs)
    tagger_dev = tagg.Tagger(folder + '/dev', True)
    tagg.model = tagg.Net(len(tagger_training.vocab),
                          len(tagger_training.tags))
    tagg.model.embeddings.weight.data.copy_(
        torch.from_numpy(tagger_training.vecs))
    tagg.optimizer = optim.Adam(tagg.model.parameters(), lr=0.001)
# install Goose https://github.com/grangier/python-goose
#
# Done so far: basic keyword extraction using tagger works.
#
# Concerns about keyword extraction using the Tagger library:
# https://github.com/apresta/tagger
# - the dictionary should be built from corpora relevant to the article to
#   be more effective at attracting attention in an immersive interface
# - TF-IDF is a function provided in the module build_dict... if articles
#   in the collection ever accumulate enough around one subject, use TF-IDF
#
# immediate todos:
# - implement multitag

from goose import Goose
import tagger
import pickle

url = "http://www.theverge.com/2014/9/11/6136443/the-largest-predatory-dinosaur-ever-was-half-duck-half-crocodile"

g = Goose()
article = g.extract(url=url).cleaned_text

weights = pickle.load(open('data/dict.pkl', 'rb'))  # or your own dictionary
mytagger = tagger.Tagger(tagger.Reader(), tagger.Stemmer(),
                         tagger.Rater(weights))
best_3_tags = mytagger(article, 6)
print(best_3_tags)
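The comments above suggest building the weights dictionary from a corpus relevant to the article; a minimal sketch of dropping in a custom pickled dictionary, assuming dict.pkl's shape is a mapping from stems to relevance weights (the stems and weights shown are invented):

# Sketch only: the exact structure of data/dict.pkl is an assumption here
# (stems mapped to relevance weights). A domain-specific dictionary pickled
# in the same shape can be swapped into the Rater.
custom_weights = {"dinosaur": 0.9, "predator": 0.8, "crocodile": 0.7}
pickle.dump(custom_weights, open('data/custom_dict.pkl', 'wb'))
custom_tagger = tagger.Tagger(tagger.Reader(), tagger.Stemmer(),
                              tagger.Rater(custom_weights))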
import sys
import argparse

try:
    import tagger
except:
    sys.path.append("/home/app/tagger")
    import tagger

#############################################################################
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='JensenLab Tagger NER utilities.')
    ops = ['tbe', 'tbe2']
    parser.add_argument("op", choices=ops, help='operation')
    parser.add_argument("--i", required=True, dest="ifile",
                        help="input (CSV|TSV|SSV|TXT)")
    parser.add_argument("--o", dest="ofile", help="output (CSV|TSV)")
    parser.add_argument("-v", "--verbose", default=0, action="count")
    args = parser.parse_args()

    fin = open(args.ifile)
    fout = open(args.ofile, "w") if args.ofile else sys.stdout

    tg = tagger.Tagger(java_script=None, re_stop=None, serials_only=False)
    doc = tg.load_local(args.ifile)
    # tg.get_entities(doc, docid, etypes)
    # tg.get_matches(doc, docid, etypes)