# project-local modules assumed by this script
import tagger
import parser
import dataReader
# import wsm


def main():
    sv_tags = [
        "<ROOT>", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB"
    ]
    en_tags = [
        "<ROOT>", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"
    ]
    # duplicate 'det'/'discourse' entries removed from the original list
    arc_tags = [
        '<ROOT>', 'name', 'nsubjpass', 'dobj', 'acl', 'advcl', 'advmod',
        'amod', 'appos', 'aux', 'case', 'cc', 'ccomp', 'clf', 'compound',
        'conj', 'cop', 'csubj', 'dep', 'det', 'discourse', 'dislocated',
        'expl', 'fixed', 'flat', 'goeswith', 'iobj', 'list', 'mark', 'nmod',
        'nsubj', 'nummod', 'obj', 'obl', 'orphan', 'parataxis', 'punct',
        'reparandum', 'root', 'vocative', 'xcomp'
    ]
    sv_train_file = "treebanks/sv_train.conllu"
    sv_test_file = "treebanks/sv_dev.conllu"
    en_train_file = "treebanks/en_train.conllu"
    en_test_file = "treebanks/en_dev.conllu"
    # model = wsm.WSM()
    # model.create_model(en_train_file)
    model_path = 'models/wsm_en'
    myTagger = tagger.Tagger(en_tags)
    arc_tagger = tagger.Tagger(arc_tags)
    myParser = parser.Parser(myTagger, arc_tagger, model_path)
    dataReader.evaluate(en_train_file, en_test_file, myParser)
    dataReader.evaluate(sv_train_file, sv_test_file, myParser)
def __init__(self, backtrack, dictionary, mapping, threshold=0.0,
             do_backtrack=False, debug=False,
             type_weight={'firsttitle': 3, 'title': 2, 'text': 1}):
    self._debug = debug
    self._backtrack = backtrack
    self._do_backtrack = do_backtrack
    self._threshold = threshold
    self._dictionary = dictionary
    self._mapping = mapping
    self._pages = set()
    self._type_entity_page = {}
    self._type_page_entity_count = {}
    self._page_entity_synonyms = {}
    self._tag = tagger.Tagger()
    self._max_tokens = 15
    self._type_weight = type_weight
    self._tag.LoadNames("%s_entities.tsv" % self._dictionary,
                        "%s_names_expanded.tsv" % self._dictionary)
    self._entity_name = self.load_names()
def create_protein_engine(ddef, context):
    db = context['db']
    logger = context['logger']
    logger.info('preparing protein engine...')
    dict_c_name = ddef['dictionary_collection']
    logger.info('\tdictionary collection name is: ' + dict_c_name)
    blacklist_name = ddef['blacklist']
    logger.info('\tblacklist collection name is: ' + blacklist_name)
    whitelist_name = ddef['whitelist']
    logger.info('\twhitelist collection name is: ' + whitelist_name)
    entity_type = ddef['entity_type']
    tgr = tagger.Tagger()
    for edict in db[dict_c_name].find():
        for entry in edict['dictionary']:
            p_a = entry[PRIMARY_ACCESSION].encode("utf-8")
            for word in entry['words']:
                tgr.add_name(word.encode("utf-8"), entity_type, p_a)
    black_timestamp = tagger_block_blacklist(db[blacklist_name], tgr)
    white_timestamp = str_DEFAULT_TIMESTAMP
    wlitems = ddef.get('whitelist_items', None)
    if len(whitelist_name) > 0 and wlitems:
        white_timestamp = tagger_add_whitelist(db[whitelist_name], tgr,
                                               entity_type,
                                               wlitems[0], wlitems[1])
    return tgr, black_timestamp, white_timestamp
import hashlib

import tagger


def loadsong():
    # Map each song to its SHA-256 digest, keyed for fast lookup.
    x = {}
    for song in tagger.Tagger().scan():
        m = hashlib.sha256()
        m.update(song.link.encode("utf-8"))
        x[m.digest()] = song
    return x
def create_chemical_engine(ddef, context):
    db = context['db']
    logger = context['logger']
    logger.info('preparing chemical engine...')
    dict_c_name = ddef['dictionary_collection']
    logger.info('\tdictionary collection name is: ' + dict_c_name)
    blacklist_name = ddef['blacklist']
    logger.info('\tblacklist collection name is: ' + blacklist_name)
    whitelist_name = ddef['whitelist']
    logger.info('\twhitelist collection name is: ' + whitelist_name)
    entity_type = ddef['entity_type']
    tgr = tagger.Tagger()
    if len(dict_c_name) > 0:
        for entry in db[dict_c_name].find():
            c_key = entry['key'].encode("utf-8")
            for word in entry['words']:
                tgr.add_name(word.encode("utf-8"), entity_type, c_key)
    black_timestamp = str_DEFAULT_TIMESTAMP
    if len(blacklist_name) > 0:
        black_timestamp = tagger_block_blacklist(db[blacklist_name], tgr)
    white_timestamp = str_DEFAULT_TIMESTAMP
    wlitems = ddef.get('whitelist_items', None)
    if len(whitelist_name) > 0 and wlitems:
        white_timestamp = tagger_add_whitelist(db[whitelist_name], tgr,
                                               entity_type,
                                               wlitems[0], wlitems[1])
    logger.info('\tload chemical dictionary completed.')
    return tgr, black_timestamp, white_timestamp
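create_protein_engine and create_chemical_engine differ only in how dictionary entries are keyed; the blacklist/whitelist tail is identical. A sketch of how that shared tail could be factored out, reusing the helpers already referenced above (the factored function itself is hypothetical, not part of the project):

# Hypothetical helper consolidating the blacklist/whitelist wiring shared by
# the two engine constructors above; tagger_block_blacklist,
# tagger_add_whitelist, and str_DEFAULT_TIMESTAMP are the names already
# used in this file.
def apply_black_white_lists(ddef, db, tgr, entity_type):
    blacklist_name = ddef['blacklist']
    whitelist_name = ddef['whitelist']
    black_timestamp = str_DEFAULT_TIMESTAMP
    if len(blacklist_name) > 0:
        black_timestamp = tagger_block_blacklist(db[blacklist_name], tgr)
    white_timestamp = str_DEFAULT_TIMESTAMP
    wlitems = ddef.get('whitelist_items', None)
    if len(whitelist_name) > 0 and wlitems:
        white_timestamp = tagger_add_whitelist(db[whitelist_name], tgr,
                                               entity_type,
                                               wlitems[0], wlitems[1])
    return black_timestamp, white_timestamp

# usage sketch: black_ts, white_ts = apply_black_white_lists(ddef, db, tgr, entity_type)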
def __init__(self, entities=None, names=None, globs=None, data_dir=None):
    # Defaults #
    if data_dir is None:
        data_dir = default_data_dir
    if entities is None:
        entities = data_dir + 'envo_entities.tsv'
    if names is None:
        names = data_dir + 'envo_names.tsv'
    if globs is None:
        globs = data_dir + 'envo_global.tsv'
    # Make an instance of the API #
    import tagger as tagger_api
    self.api = tagger_api.Tagger()
    self.api.LoadNames(entities, names)
    self.api.LoadGlobal(globs)
def predict(json_data, result):
    try:
        appid = json_data['appid']
        sentence = json_data['sentence']
        sentence = process_chinese_number(sentence)
        e = engines[appid]
        vf = vocab_file(appid, CLASSIFIER)
        _, v = dp.build_vocabulary(vf)
        labels = load_labels(appid)
        # jieba.load_userdict(vf)
        user_dict = load_dict(data_dir(appid) + "user_dict.txt")
        load_dict_for_jiaba(user_dict)
        reload = False
        if 'reload' in e:
            reload = e['reload']
        if (CLASSIFIER not in e) or reload:
            e[CLASSIFIER] = cc.Classifier(v, appid,
                                          data_dir(appid) + 'classifier',
                                          labels)
        operation = e[CLASSIFIER].predict(sentence)
        if operation == '':
            return 'operation not gotten'
        logging.info("the operation is %s" % operation)
        category = appid + "_" + operation
        vf = vocab_file(appid, category)
        if (operation not in e) or reload:
            e[operation] = tagger.Tagger(data_file(appid, operation), vf,
                                         category,
                                         data_dir(appid) + category,
                                         user_dict)
        pairs = e[operation].determine(sentence)
    except Exception as e:
        logging.error(e)
        logging.debug(format_tb(e.__traceback__))
        return e
    result['appid'] = appid
    result['operation'] = operation
    result['data'] = pairs
    return result
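predict() takes an already-parsed JSON dict plus a result dict to fill in; a usage sketch under the assumption that a "demo" app has already been trained (the appid and sentence are invented):

# Hypothetical call to predict() above; assumes train() has been run for
# the "demo" appid so its classifier and taggers exist on disk.
out = predict({"appid": "demo", "sentence": "turn on the light"}, {})
# out -> {"appid": "demo", "operation": <predicted label>, "data": <slot pairs>}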
def __init__(self, backtrack, dictionary, mapping, threshold=0.0,
             do_backtrack=False, debug=False):
    self._debug = debug
    self._backtrack = backtrack
    self._do_backtrack = do_backtrack
    self._threshold = threshold
    self._dictionary = dictionary
    self._mapping = mapping
    self._pages = set()
    self._page_entity = {}
    self._tag = tagger.Tagger()
    self._max_tokens = 15
    self._tag.LoadNames("%s_entities.tsv" % self._dictionary,
                        "%s_names_expanded.tsv" % self._dictionary)
    self._entity_name = self.load_names()
def extract_event(self):
    self.dump_information()


class TodoExtractor(Extractor):
    def __init__(self, toks):
        Extractor.__init__(self, toks)

    def extract_todo(self):
        self.dump_information()


if __name__ == "__main__":
    common = Commonwords(commonwords_path)
    event_tagger = tagger.Tagger(classifier="event.ser.gz", port=1111)
    todo_tagger = tagger.Tagger(classifier="todo.ser.gz", port=2222)
    all_tagger = tagger.Tagger(classifier="all.ser.gz", port=3333)
    filenames = [f for f in os.listdir(test_dir)
                 if os.path.isfile(os.path.join(test_dir, f))]
    classifier = Classifier(all_tagger, common)
    for filename in filenames:
        if filename == "freq":
            continue
        msg = open(os.path.join(test_dir, filename), "r").read()
        print("++++++++++++++++++++++++++++++++++++++++")
        print(msg)
        print("++++++++++++++++++++++++++++++++++++++++")
import logging

import tagger
import geocoder
from hasher import hashsong
from laststuff import getartistinfo
from icecream import ic as debug
from flask import Flask, render_template, Response

app = Flask(__name__)
app.secret_key = "music"
app.config["TESTING"] = True

root = logging.getLogger()
root.setLevel(logging.DEBUG)

songsname = tagger.Tagger()


def loadsong():
    # Index scanned songs by the hex digest of their hash.
    x = {}
    for song in songsname.scan():
        m = hashsong(song)
        x[m.digest().hex()] = song
    return x


mysong = loadsong()
# debug(mysong)


@app.route("/")
entry_filter = "" for o, a in opts: if o == "-t": threshold = float(a) elif o == "-m": mapping = a #if not os.path.isfile(corpus_filename): # print >> sys.stderr, "%s does not exits." % corpus_filename # sys.exit(2) elif o == "-d": dictionary = a elif o == "-f": dictionary_filter = a tag = tagger.Tagger() tag.LoadNames("%s_entities.tsv" % dictionary, "%s_names_expanded.tsv" % dictionary) #check ignore stuff entry_filter = {} if not os.path.isfile(dictionary_filter): for line in open(dictionary_filter): f = line[:-1].split("\t") entry_filter[f[0]] = f[1] serial_entity = {} for line in open("%s_entities.tsv" % dictionary): serial, t, entry = line[:-1].split("\t") serial_entity[serial] = entry
def train(data):
    try:
        json_data = json.loads(data.decode('utf-8'))
        appid = json_data['appid']
        train_data = json_data['data']
        if appid not in engines:
            engines[appid] = {}
        engines[appid]['reload'] = True
        logging.info("start train for %s" % appid)
        base_dir = data_dir(appid)
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)
        # Create the labels file and store the classification labels
        classifier_data = {}
        labels = list(train_data.keys())
        logging.info("labels %s" % labels)
        lf = labels_file(appid)
        with open(lf, "w", encoding='utf-8') as f:
            data = train_data.keys()
            result = map(lambda x: x.strip() + "\n", data)
            f.writelines(result)
        user_dict = load_dict(data_dir(appid) + "user_dict.txt")
        load_dict_for_jiaba(user_dict)
        # Create one data file per category
        data_list = []
        for d in train_data.items():
            operation = d[0]
            t_data = d[1]
            df = data_file(appid, operation)
            category = appid + "_" + operation
            logging.info("create tagger for %s" % category)
            data_list.append((df, category))
            with open(df, "w", encoding='utf-8') as f:
                for line in t_data:
                    f.write(line + "\r\n")
                    # strip [...] slot annotations before feeding the
                    # classifier
                    pattern = re.compile(r'\[.*?\]')
                    line = re.sub(pattern, '', line)
                    classifier_data[line] = operation
        for data in data_list:
            df, category = data
            vf = vocab_file(appid, category)
            dp.create_vocabulary_from_data_file(vf, df)
            step = 2
            t = tagger.Tagger(df, vf, category, data_dir(appid) + category,
                              user_dict, step)
            t.train()
        vf = vocab_file(appid, CLASSIFIER)
        d = ".".join(list(classifier_data.keys()))
        lex, v = dp.create_vocabulary_from_data(vf, d, False)
        classifier = cc.Classifier(v, appid, data_dir(appid) + 'classifier',
                                   labels)
        classifier.train(classifier_data)
    except Exception as e:
        logging.error(e)
        logging.debug(format_tb(e.__traceback__))
        return False
    return True
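train() expects a UTF-8 JSON payload whose "data" maps each operation label to annotated lines; a sketch of that wire format (the labels and bracketed slot annotations are invented for illustration):

# Hypothetical training payload for train() above. Keys under "data" become
# classifier labels; the bracketed [...] spans are stripped before the lines
# are used as classifier training text.
example_payload = json.dumps({
    "appid": "demo",
    "data": {
        "turn_on": ["turn on the [light]", "switch the [fan] on"],
        "turn_off": ["turn off the [light]"],
    }
}, ensure_ascii=False).encode('utf-8')
# train(example_payload)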
import tagger as tag

""" Loading Corpus """
corpus_path = 'brown_corpus_modified'
myTagger = tag.Tagger()
tokenList = myTagger.load_corpus(corpus_path)

""" Initializing Frequencies and then the Probabilities for:
    Initial Tagging, Transition, Emission """
for sentence in tokenList:
    myTagger.initialize_frequencies(sentence)
myTagger.initialize_probabilities()

# print("Initial Tag Probability: \n", myTagger.get_initial_tag_probability())
#
# print("Transition Probability: \n", myTagger.get_transition_probability())
#
# print("Emission Probability: \n", myTagger.get_emission_probability())

""" Testing the Tagging using Viterbi Decode for two sentences:
    1. The Secretariat is expected to race tomorrow .
    2. People continue to enquire the reason for the race for outer space . """
sentence1 = "The Secretariat is expected to race tomorrow ."
sentence2 = "People continue to enquire the reason for the race for outer space ."
print("Tagging of Sentence 1: ", myTagger.viterbi_decode(sentence1))
print("Tagging of Sentence 2: ", myTagger.viterbi_decode(sentence2))
import sys
import time

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
import numpy as np

import tagger as tagg

start_time = time.time()

if __name__ == '__main__':
    folder = sys.argv[1]
    tagger_training = tagg.Tagger(folder + '/train')
    tagger_training.define_all_data()
    tagger_dev = tagg.Tagger(folder + '/dev')
    tagg.model = tagg.Net(len(tagger_training.vocab),
                          len(tagger_training.tags))
    tagg.optimizer = optim.Adam(tagg.model.parameters(), lr=0.001)
    tagger_dev.define_x_y_not_train(tagger_training.vocab,
                                    tagger_training.tags_to_ix, True)
    tagger_test = tagg.Tagger(folder + '/test')
    tagger_test.define_x_y_not_train(tagger_training.vocab,
                                     tagger_training.tags_to_ix, False)
    losses = []
    epochs = 30
    data = []
    for epoch in range(epochs):
        print("Start epoch " + str(epoch), str(tagg.passed_time(start_time)))
import os
import pickle

import tagger

datafile = os.path.join(os.path.dirname(__file__), '..', 'data/dict.pkl')
# print datafile
weights = pickle.load(open(datafile, 'rb'))

rdr = tagger.Reader()
stmr = tagger.Stemmer()
rtr = tagger.Rater(weights)
extract_tags = tagger.Tagger(rdr, stmr, rtr)
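extract_tags here is a callable Tagger instance; going by the mytagger(article, 6) call pattern later in this section, usage would look like the sketch below (the sample text is invented):

# Usage sketch: Tagger instances are called with (text, number_of_tags),
# matching the mytagger(article, 6) call further down this file.
sample = "Python is a widely used programming language for data analysis."
tags = extract_tags(sample, 5)
print(tags)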
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
    'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
    'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
    'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
    'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
    'skills'
]

weights = pickle.load(open('jobtagdict.pkl', 'rb'))  # or your own dictionary
# weights2 = pickle.load(open('skilltagdict.pkl', 'rb'))
weights2 = pickle.load(open('tagger/data/dict.pkl', 'rb'))

myreader = tagger.Reader()    # or your own reader class
mystemmer = tagger.Stemmer()  # or your own stemmer class
myrater = tagger.Rater(weights)  # or your own... (you got the idea)
myrater2 = tagger.Rater(weights2)

jobtagger = tagger.Tagger(myreader, mystemmer, myrater)
skilltagger = tagger.Tagger(myreader, mystemmer, myrater2)

# In[ ]:

with open("stopwords.pkl") as f:
    l = JobListings(f.read())
save_object(l.listings, r'joblistings-cs.pkl')

# In[46]:

with open('jobtagcorpus.txt', 'wb') as f:
    for job in listings:
        f.write(job.desc)
        if getattr(job, 'qualifications', False):
            f.write(job.qualifications)
import sys
import time

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
import numpy as np

import tagger as tagg

start_time = time.time()

if __name__ == '__main__':
    folder = sys.argv[1]
    # TODO make it optional
    vecs_file = sys.argv[2] if len(sys.argv) > 2 else False
    vocab_file = sys.argv[3] if len(sys.argv) > 3 else False
    tagger_training = tagg.Tagger(folder + '/train', True)
    # if we use the embedding words
    if vecs_file:
        name = ".embed."
        tagger_training.define_all_data_from_file(vecs_file, vocab_file)
    else:
        name = ""
        tagger_training.define_all_data()
    tagger_training.create_sub_words()
    tagger_training.activate_sub_words(tagger_training.vecs)
    tagger_dev = tagg.Tagger(folder + '/dev', True)
    tagg.model = tagg.Net(len(tagger_training.vocab),
                          len(tagger_training.tags))
    tagg.model.embeddings.weight.data.copy_(
        torch.from_numpy(tagger_training.vecs))
    tagg.optimizer = optim.Adam(tagg.model.parameters(), lr=0.001)
# install Goose https://github.com/grangier/python-goose
#
# Done so far: basic keyword extraction using tagger works.
#
# Concerns about keyword extraction using the Tagger library:
# https://github.com/apresta/tagger
# - the dictionary should be built from corpora relevant to the article to
#   be more effective at attracting attention in an immersive interface
# - TF-IDF is a function provided in the module build_dict... if articles
#   in the collection ever accumulate enough around one subject, use TF-IDF
#
# immediate todos:
# - implement multitag

from goose import Goose
import tagger
import pickle

url = "http://www.theverge.com/2014/9/11/6136443/the-largest-predatory-dinosaur-ever-was-half-duck-half-crocodile"

g = Goose()
article = g.extract(url=url).cleaned_text

weights = pickle.load(open('data/dict.pkl', 'rb'))  # or your own dictionary
mytagger = tagger.Tagger(tagger.Reader(), tagger.Stemmer(),
                         tagger.Rater(weights))
best_3_tags = mytagger(article, 6)
print(best_3_tags)
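The comments above suggest building the weights dictionary from a corpus relevant to the article; a minimal sketch of dropping in a custom pickled dictionary, assuming dict.pkl's shape is a mapping from stems to relevance weights (the stems and weights shown are invented):

# Sketch only: the exact structure of data/dict.pkl is an assumption here
# (stems mapped to relevance weights). A domain-specific dictionary pickled
# in the same shape can be swapped into the Rater.
custom_weights = {"dinosaur": 0.9, "predator": 0.8, "crocodile": 0.7}
pickle.dump(custom_weights, open('data/custom_dict.pkl', 'wb'))
custom_tagger = tagger.Tagger(tagger.Reader(), tagger.Stemmer(),
                              tagger.Rater(custom_weights))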
import sys
import argparse

try:
    import tagger
except:
    sys.path.append("/home/app/tagger")
    import tagger

#############################################################################
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='JensenLab Tagger NER utilities.')
    ops = ['tbe', 'tbe2']
    parser.add_argument("op", choices=ops, help='operation')
    parser.add_argument("--i", required=True, dest="ifile",
                        help="input (CSV|TSV|SSV|TXT)")
    parser.add_argument("--o", dest="ofile", help="output (CSV|TSV)")
    parser.add_argument("-v", "--verbose", default=0, action="count")
    args = parser.parse_args()

    fin = open(args.ifile)
    fout = open(args.ofile, "w") if args.ofile else sys.stdout

    tg = tagger.Tagger(java_script=None, re_stop=None, serials_only=False)
    doc = tg.load_local(args.ifile)
    # tg.get_entities(doc, docid, etypes)
    # tg.get_matches(doc, docid, etypes)