Example 1
def main():
    sv_tags = [
        "<ROOT>", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB"
    ]
    en_tags = [
        "<ROOT>", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"
    ]
    arc_tags = [
        '<ROOT>', 'name', 'nsubjpass', 'dobj', 'acl', 'advcl', 'advmod',
        'amod', 'appos', 'aux', 'case', 'cc', 'ccomp', 'clf', 'compound',
        'conj', 'cop', 'csubj', 'dep', 'det', 'discourse',
        'dislocated', 'expl', 'fixed', 'flat', 'goeswith', 'iobj', 'list',
        'mark', 'nmod', 'nsubj', 'nummod', 'obj', 'obl', 'orphan', 'parataxis',
        'punct', 'reparandum', 'root', 'vocative', 'xcomp'
    ]

    sv_train_file = "treebanks/sv_train.conllu"
    sv_test_file = "treebanks/sv_dev.conllu"
    en_train_file = "treebanks/en_train.conllu"
    en_test_file = "treebanks/en_dev.conllu"

    #model = wsm.WSM()
    #model.create_model(en_train_file)

    model_path = 'models/wsm_en'
    myTagger = tagger.Tagger(en_tags)
    arc_tagger = tagger.Tagger(arc_tags)
    myParser = parser.Parser(myTagger, arc_tagger, model_path)
    dataReader.evaluate(en_train_file, en_test_file, myParser)
    dataReader.evaluate(sv_train_file, sv_test_file, myParser)
Example 2
 def __init__(self,
              backtrack,
              dictionary,
              mapping,
              threshold=0.0,
              do_backtrack=False,
              debug=False,
              type_weight={
                  'firsttitle': 3,
                  'title': 2,
                  'text': 1
              }):
     self._debug = debug
     self._backtrack = backtrack
     self._do_backtrack = do_backtrack
     self._threshold = threshold
     self._dictionary = dictionary
     self._mapping = mapping
     self._pages = set()
     self._type_entity_page = {}
     self._type_page_entity_count = {}
     self._page_entity_synonyms = {}
     self._tag = tagger.Tagger()
     self._max_tokens = 15
     self._type_weight = type_weight
     self._tag.LoadNames("%s_entities.tsv" % self._dictionary,
                         "%s_names_expanded.tsv" % self._dictionary)
     self._entity_name = self.load_names()
Example 3
def create_protein_engine(ddef, context):
    db = context['db']
    logger = context['logger']
    logger.info('preparing protein engine...')
    dict_c_name = ddef['dictionary_collection']
    logger.info('\tdictionary collection name is: ' + dict_c_name)
    blacklist_name = ddef['blacklist']
    logger.info('\tblacklist collection name is: ' + blacklist_name)
    whitelist_name = ddef['whitelist']
    logger.info('\twhitelist collection name is: ' + whitelist_name)
    entity_type = ddef['entity_type']

    tgr = tagger.Tagger()
    for edict in db[dict_c_name].find():
        for entry in edict['dictionary']:
            p_a = entry[PRIMARY_ACCESSION].encode("utf-8")
            for word in entry['words']:
                tgr.add_name(word.encode("utf-8"), entity_type, p_a)

    black_timestamp = tagger_block_blacklist(db[blacklist_name], tgr)
    white_timestamp = str_DEFAULT_TIMESTAMP
    wlitems = ddef.get('whitelist_items', None)
    if len(whitelist_name) > 0 and wlitems:
        white_timestamp = tagger_add_whitelist(db[whitelist_name], tgr, entity_type,  \
            wlitems[0], wlitems[1])
    return tgr, black_timestamp, white_timestamp
Example 4
def loadsong():
    x = {}
    for song in tagger.Tagger().scan():
        m = hashlib.sha256()
        m.update(song.link.encode("utf-8"))
        x[m.digest()] = song
    return x
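
A minimal lookup counterpart to the index built above, assuming song.link is a plain string as in the snippet; find_song is a hypothetical helper, not part of the original code.

import hashlib

songs = loadsong()

def find_song(link):
    # Recompute the SHA-256 digest that loadsong() stores as the dictionary key.
    key = hashlib.sha256(link.encode("utf-8")).digest()
    return songs.get(key)  # None if the link was never scanned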
Example 5
def create_chemical_engine(ddef, context):
    db = context['db']
    logger = context['logger']
    logger.info('preparing chemical engine...')
    dict_c_name = ddef['dictionary_collection']
    logger.info('\tdictionary collection name is: ' + dict_c_name)
    blacklist_name = ddef['blacklist']
    logger.info('\tblacklist collection name is: ' + blacklist_name)
    whitelist_name = ddef['whitelist']
    logger.info('\twhitelist collection name is: ' + whitelist_name)
    entity_type = ddef['entity_type']
    tgr = tagger.Tagger()
    if len(dict_c_name) > 0:
        for entry in db[dict_c_name].find():
            c_key = entry['key'].encode("utf-8")
            for word in entry['words']:
                tgr.add_name(word.encode("utf-8"), entity_type, c_key)
    black_timestamp = str_DEFAULT_TIMESTAMP
    if len(blacklist_name) > 0:
        black_timestamp = tagger_block_blacklist(db[blacklist_name], tgr)
    white_timestamp = str_DEFAULT_TIMESTAMP
    wlitems = ddef.get('whitelist_items', None)
    if len(whitelist_name) > 0 and wlitems:
        white_timestamp = tagger_add_whitelist(db[whitelist_name], tgr, entity_type, \
                wlitems[0], wlitems[1])
    logger.info('\tload chemical dictionary completed.')
    return tgr, black_timestamp, white_timestamp
Example 6
 def __init__(self, entities=None, names=None, globs=None, data_dir=None):
     # Defaults #
     if data_dir is None: data_dir = default_data_dir
     if entities is None: entities = data_dir + 'envo_entities.tsv'
     if names is None: names = data_dir + 'envo_names.tsv'
     if globs is None: globs = data_dir + 'envo_global.tsv'
     # Make an instance of the API #
     import tagger as tagger_api
     self.api = tagger_api.Tagger()
     self.api.LoadNames(entities, names)
     self.api.LoadGlobal(globs)
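
Assuming an instance of this class is available as, say, envo (a hypothetical name), a follow-up lookup might be sketched as below; the GetMatches method, its argument order, and the -27 entity-type code for ENVO terms are assumptions about the underlying tagger bindings, not anything confirmed by the snippet.

# Assumed usage sketch (method name, signature, and type code are assumptions):
text = "Samples were collected from freshwater sediment."
matches = envo.api.GetMatches(text, "doc1", [-27])  # -27: assumed code for ENVO terms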
Example 7
def predict(json_data, result):
    try:
        appid = json_data['appid']
        sentence = json_data['sentence']
        sentence = process_chinese_number(sentence)
        e = engines[appid]
        vf = vocab_file(appid, CLASSIFIER)
        _, v = dp.build_vocabulary(vf)
        labels = load_labels(appid)
        # jieba.load_userdict(vf)

        user_dict = load_dict(data_dir(appid) + "user_dict.txt")
        load_dict_for_jiaba(user_dict)

        reload = False
        if 'reload' in e:
            reload = e['reload']
        if (CLASSIFIER not in e) or reload:
            e[CLASSIFIER] = cc.Classifier(v, appid,
                                          data_dir(appid) + 'classifier',
                                          labels)

        operation = e[CLASSIFIER].predict(sentence)

        if operation == '':
            return 'operation not gotten'

        logging.info("the operation is %s" % operation)

        category = appid + "_" + operation
        vf = vocab_file(appid, category)
        if (operation not in e) or reload:
            e[operation] = tagger.Tagger(data_file(appid, operation), vf,
                                         category,
                                         data_dir(appid) + category, user_dict)

        pairs = e[operation].determine(sentence)

    except Exception as e:
        logging.error(e)
        logging.debug(format_tb(e.__traceback__))
        return e

    result['appid'] = appid
    result['operation'] = operation
    result['data'] = pairs
    return result
Example 8
 def __init__(self,
              backtrack,
              dictionary,
              mapping,
              threshold=0.0,
              do_backtrack=False,
              debug=False):
     self._debug = debug
     self._backtrack = backtrack
     self._do_backtrack = do_backtrack
     self._threshold = threshold
     self._dictionary = dictionary
     self._mapping = mapping
     self._pages = set()
     self._page_entity = {}
     self._tag = tagger.Tagger()
     self._max_tokens = 15
     self._tag.LoadNames("%s_entities.tsv" % self._dictionary,
                         "%s_names_expanded.tsv" % self._dictionary)
     self._entity_name = self.load_names()
Example 9
    def extract_event(self):
        self.dump_information()


class TodoExtractor(Extractor):
    def __init__(self, toks):
        Extractor.__init__(self, toks)

    def extract_todo(self):
        self.dump_information()


if __name__ == "__main__":
    common = Commonwords(commonwords_path)

    event_tagger = tagger.Tagger(classifier="event.ser.gz", port=1111)
    todo_tagger = tagger.Tagger(classifier="todo.ser.gz", port=2222)
    all_tagger = tagger.Tagger(classifier="all.ser.gz", port=3333)

    filenames = [
        f for f in os.listdir(test_dir)
        if os.path.isfile(os.path.join(test_dir, f))
    ]
    classifier = Classifier(all_tagger, common)
    for filename in filenames:
        if filename == "freq":
            continue
        msg = open(os.path.join(test_dir, filename), "r").read()
        print "++++++++++++++++++++++++++++++++++++++++"
        print msg
        print "++++++++++++++++++++++++++++++++++++++++"
Example 10
import logging
import tagger
import geocoder
from hasher import hashsong
from laststuff import getartistinfo
from icecream import ic as debug
from flask import Flask, render_template, Response

app = Flask(__name__)
app.secret_key = "music"
app.config["TESTING"] = True

root = logging.getLogger()
root.setLevel(logging.DEBUG)

songsname = tagger.Tagger()


def loadsong():
    x = {}
    for song in songsname.scan():
        m = hashsong(song)
        x[m.digest().hex()] = song
    return x


mysong = loadsong()
# debug(mysong)


@app.route("/")
Example 11
entry_filter = ""

for o, a in opts:
    if o == "-t":
        threshold = float(a)
    elif o == "-m":
        mapping = a
        #if not os.path.isfile(corpus_filename):
        #	print >> sys.stderr, "%s does not exits." % corpus_filename
        #	sys.exit(2)
    elif o == "-d":
        dictionary = a
    elif o == "-f":
        dictionary_filter = a

tag = tagger.Tagger()
tag.LoadNames("%s_entities.tsv" % dictionary,
              "%s_names_expanded.tsv" % dictionary)

#check ignore stuff

entry_filter = {}
if os.path.isfile(dictionary_filter):
    for line in open(dictionary_filter):
        f = line[:-1].split("\t")
        entry_filter[f[0]] = f[1]

serial_entity = {}
for line in open("%s_entities.tsv" % dictionary):
    serial, t, entry = line[:-1].split("\t")
    serial_entity[serial] = entry
Example 12
def train(data):
    try:
        json_data = json.loads(data.decode('utf-8'))
        appid = json_data['appid']
        train_data = json_data['data']
        if appid not in engines:
            engines[appid] = {}
        engines[appid]['reload'] = True
        logging.info("start train for %s" % appid)
        # ce
        base_dir = data_dir(appid)
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        # Create the label file and save the classification labels
        classifier_data = {}
        labels = list(train_data.keys())
        logging.info("labels %s" % labels)

        lf = labels_file(appid)
        with open(lf, "w", encoding='utf-8') as f:
            data = train_data.keys()
            result = map(lambda x: x.strip() + "\n", data)
            f.writelines(result)

        user_dict = load_dict(data_dir(appid) + "user_dict.txt")
        load_dict_for_jiaba(user_dict)

        # Create the per-category data files
        data_list = []
        for d in train_data.items():
            operation = d[0]
            t_data = d[1]
            df = data_file(appid, operation)
            category = appid + "_" + operation
            logging.info("create tagger for %s" % category)
            data_list.append((df, category))

            with open(df, "w", encoding='utf-8') as f:
                for line in t_data:
                    f.write(line + "\r\n")
                    pattern = re.compile(r'\[.*?\]')
                    line = re.sub(pattern, '', line)
                    classifier_data[line] = operation

        for data in data_list:
            df, category = data
            vf = vocab_file(appid, category)
            dp.create_vocabulary_from_data_file(vf, df)
            step = 2
            t = tagger.Tagger(df, vf, category,
                              data_dir(appid) + category, user_dict, step)
            t.train()

        vf = vocab_file(appid, CLASSIFIER)
        d = ".".join(list(classifier_data.keys()))
        lex, v = dp.create_vocabulary_from_data(vf, d, False)
        classifier = cc.Classifier(v, appid,
                                   data_dir(appid) + 'classifier', labels)
        classifier.train(classifier_data)
    except Exception as e:
        logging.error(e)
        logging.debug(format_tb(e.__traceback__))
        return False

    return True
Example 13
import tagger as tag
"""
Loading Corpus
"""
corpus_path = 'brown_corpus_modified'
myTagger = tag.Tagger()
tokenList = myTagger.load_corpus(corpus_path)
"""
Initializing Frequencies and then the Probabilities for:
Initial Tagging, Transition, Emission
"""
for sentence in tokenList:
    myTagger.initialize_frequencies(sentence)
myTagger.initialize_probabilities()
# print("Initial Tag Probability: \n", myTagger.get_initial_tag_probability())
#
# print("Transition Probability: \n", myTagger.get_transition_probability())
#
# print("Emission Probability: \n", myTagger.get_emission_probability())
"""
Testing the Tagging using Viterbi Decode for two sentences:
1. The Secretariat is expected to race tomorrow .
2. People continue to enquire the reason for the race for outer space .
"""
sentence1 = "The Secretariat is expected to race tomorrow ."
sentence2 = "People continue to enquire the reason for the race for outer space ."
print("Tagging of Sentence 1: ", myTagger.viterbi_decode(sentence1))
print("Tagging of Sentence 2: ", myTagger.viterbi_decode(sentence2))
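
The snippet above only calls viterbi_decode(), so here is a minimal, self-contained Viterbi sketch over toy hand-made probability tables; it is independent of the Tagger class, and the tag set, table values, and function name are illustrative assumptions rather than values from the corpus.

def viterbi(words, tags, initial, transition, emission):
    # best[tag] = (probability of the best path ending in tag, that path)
    best = {t: (initial[t] * emission[t].get(words[0], 1e-9), [t]) for t in tags}
    for word in words[1:]:
        best = {
            t: max(
                (best[prev][0] * transition[prev][t] * emission[t].get(word, 1e-9),
                 best[prev][1] + [t])
                for prev in tags
            )
            for t in tags
        }
    return max(best.values())[1]

# Toy tables, made up purely for illustration.
toy_tags = ["DET", "NOUN", "VERB"]
toy_initial = {"DET": 0.6, "NOUN": 0.3, "VERB": 0.1}
toy_transition = {
    "DET": {"DET": 0.05, "NOUN": 0.85, "VERB": 0.1},
    "NOUN": {"DET": 0.1, "NOUN": 0.3, "VERB": 0.6},
    "VERB": {"DET": 0.4, "NOUN": 0.4, "VERB": 0.2},
}
toy_emission = {
    "DET": {"the": 0.9},
    "NOUN": {"race": 0.5, "secretariat": 0.3},
    "VERB": {"race": 0.2, "is": 0.4},
}
print(viterbi(["the", "race"], toy_tags, toy_initial, toy_transition, toy_emission))  # ['DET', 'NOUN']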
Example 14
import sys
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
import numpy as np
import time
import tagger as tagg
start_time = time.time()

if __name__ == '__main__':
    folder = sys.argv[1]
    tagger_training = tagg.Tagger(folder + '/train')
    tagger_training.define_all_data()
    tagger_dev = tagg.Tagger(folder + '/dev')
    tagg.model = tagg.Net(len(tagger_training.vocab),
                          len(tagger_training.tags))
    tagg.optimizer = optim.Adam(tagg.model.parameters(), lr=0.001)
    tagger_dev.define_x_y_not_train(tagger_training.vocab,
                                    tagger_training.tags_to_ix, True)
    tagger_test = tagg.Tagger(folder + '/test')
    tagger_test.define_x_y_not_train(tagger_training.vocab,
                                     tagger_training.tags_to_ix, False)

    losses = []
    epochs = 30
    data = []
    for epoch in range(epochs):
        print "Start epoch " + str(epoch), str(tagg.passed_time(start_time))
Example 15
import os
import pickle
import tagger

datafile = os.path.join(os.path.dirname(__file__), '..', 'data/dict.pkl')
# print datafile
weights = pickle.load(open(datafile, 'rb'))
rdr = tagger.Reader()
stmr = tagger.Stemmer()
rtr = tagger.Rater(weights)

extract_tags = tagger.Tagger(rdr, stmr, rtr)
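
The resulting extract_tags object is callable; a brief usage sketch following the call pattern shown in Example 18 below (the sample text is made up):

sample_text = "Open-source keyword extraction assigns weighted tags to raw text."
print(extract_tags(sample_text, 3))  # top 3 tags for the sample text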
Example 16
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
    'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
    'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too',
    'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'skills'
]

weights = pickle.load(open('jobtagdict.pkl', 'rb'))  # or your own dictionary
# weights2 = pickle.load(open('skilltagdict.pkl', 'rb'))
weights2 = pickle.load(open('tagger/data/dict.pkl', 'rb'))
myreader = tagger.Reader()  # or your own reader class
mystemmer = tagger.Stemmer()  # or your own stemmer class
myrater = tagger.Rater(weights)  # or your own... (you got the idea)
myrater2 = tagger.Rater(weights2)
jobtagger = tagger.Tagger(myreader, mystemmer, myrater)
skilltagger = tagger.Tagger(myreader, mystemmer, myrater2)

# In[ ]:

with open("stopwords.pkl") as f:
    l = JobListings(f.read())
    save_object(l.listings, r'joblistings-cs.pkl')

# In[46]:

with open('jobtagcorpus.txt', 'wb') as f:
    for job in listings:
        f.write(job.desc)
        if getattr(job, 'qualifications', False):
            f.write(job.qualifications)
Example 17
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
import numpy as np
import time
import tagger as tagg
start_time = time.time()

if __name__ == '__main__':
    folder = sys.argv[1]
    # TODO make it optionnal
    vecs_file = sys.argv[2] if len(sys.argv) > 2 else False
    vocab_file = sys.argv[3] if len(sys.argv) > 3 else False
    tagger_training = tagg.Tagger(folder + '/train', True)
    # if we use the embedding words
    if vecs_file:
        name = ".embed."
        tagger_training.define_all_data_from_file(vecs_file, vocab_file)
    else:
        name = ""
        tagger_training.define_all_data()
    tagger_training.create_sub_words()
    tagger_training.activate_sub_words(tagger_training.vecs)
    tagger_dev = tagg.Tagger(folder + '/dev', True)
    tagg.model = tagg.Net(len(tagger_training.vocab),
                          len(tagger_training.tags))
    tagg.model.embeddings.weight.data.copy_(
        torch.from_numpy(tagger_training.vecs))
    tagg.optimizer = optim.Adam(tagg.model.parameters(), lr=0.001)
Example 18
# install Goose https://github.com/grangier/python-goose
#
# Done so far: basic keyword extraction using tagger works.
#
# Concerns about keyword extraction using Tagger library:
# https://github.com/apresta/tagger
# - the dictionary should be built from corpora relevant to the article to be
# 	more effective at attracting attention in the immersive interface
# - TF-IDF is a function provided in the module build_dict... if articles
# 	in collection ever accumulate enough around one subject, use TF-IDF
#
# immediate todos:
# - implement multitag

from goose import Goose
import tagger
import pickle

url = "http://www.theverge.com/2014/9/11/6136443/the-largest-predatory-dinosaur-ever-was-half-duck-half-crocodile"
g = Goose()
article = g.extract(url=url).cleaned_text

weights = pickle.load(open('data/dict.pkl', 'rb'))  # or your own dictionary
mytagger = tagger.Tagger(tagger.Reader(), tagger.Stemmer(),
                         tagger.Rater(weights))
best_3_tags = mytagger(article, 6)
print best_3_tags
Example 19
try:
    import tagger
except:
    sys.path.append("/home/app/tagger")
    import tagger

#############################################################################
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='JensenLab Tagger NER utilities.')
    ops = ['tbe', 'tbe2']
    parser.add_argument("op", choices=ops, help='operation')
    parser.add_argument("--i",
                        required=True,
                        dest="ifile",
                        help="input (CSV|TSV|SSV|TXT)")
    parser.add_argument("--o", dest="ofile", help="output (CSV|TSV)")
    parser.add_argument("-v", "--verbose", default=0, action="count")
    args = parser.parse_args()

    fin = open(args.ifile)

    fout = open(args.ofile, "w") if args.ofile else sys.stdout

    tg = tagger.Tagger(java_script=None, re_stop=None, serials_only=False)

    doc = tg.load_local(args.ifile)

    #tg.get_entities(doc, docid, etypes)

    #tg.get_matches(doc, docid, etypes)