def getgoldevents():
    zparModel = ZPar('english-models')
    #tagger = zparModel.get_tagger()
    depparser = zparModel.get_depparser()
    #stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()

    gevents = file("../ni_data/event_descriptions.tsv").readlines()
    gevents = [line.strip().split("\t")[1].strip("\"") for line in gevents]

    gold_events = []
    for line in gevents:
        # one token per line: word <TAB> tag <TAB> head-index <TAB> dep-label
        parsed_sent = depparser.dep_parse_sentence(line)
        items = parsed_sent.strip().split("\n")
        items = [item.strip().split("\t") for item in items]
        words = [item[0] for item in items]
        tags = [item[1].lower() for item in items]
        links = [int(item[2]) for item in items]
        deps = [item[3].lower() for item in items]

        # keep content words (nouns, verbs, adjectives, numbers, adverbs)
        # that fill a core grammatical role
        valid_words = [
            words[idx] for idx, tag in enumerate(tags)
            if tag[:2] in ["nn", "vb", "jj", "cd", "rb"]
            if deps[idx] in ["root", "sub", "obj", "vc", "vmod", "nmod", "pmod"]
        ]
        #stemmed_words = [stemmer.stem(word.lower()) for word in valid_words if word not in ["is", "are", "a", "an", "be", "had", "ha"]]
        stemmed_words = [
            wordnet_lemmatizer.lemmatize(word.lower())
            for word in valid_words
            if word not in ["is", "are", "a", "an", "be", "had", "ha"]
        ]
        print "-gold", stemmed_words
        gold_events.append(list(set(stemmed_words)))
    return gold_events
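# A minimal sketch (not from the original project) of the token format that
# getgoldevents() above assumes: dep_parse_sentence() yields one token per
# line with tab-separated "word <TAB> tag <TAB> head-index <TAB> dep-label"
# fields. parse_dep_output and the example call are illustrative.
def parse_dep_output(parsed_sent):
    tokens = []
    for line in parsed_sent.strip().split("\n"):
        word, tag, head, label = line.strip().split("\t")[:4]
        tokens.append((word, tag.lower(), int(head), label.lower()))
    return tokens

# e.g. parse_dep_output(depparser.dep_parse_sentence("I am going to the market."))
# should return tuples whose labels include "root", "sub", "obj", etc.,
# matching the dependency filter used above.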
def __init__(self, addr, zpar_model_path, model_list, *args, **kwds):
    # store the hostname and port number
    self.myhost, self.myport = addr

    # store the link to the loaded zpar object
    self.z = ZPar(zpar_model_path)

    # initialize the parent class
    _baseclass.__init__(self, addr, *args, **kwds)

    # Call the individual loading functions
    # and only register the appropriate methods
    if 'tagger' in model_list:
        tagger = self.z.get_tagger()
        self.register_function(tagger.tag_sentence)
        self.register_function(tagger.tag_file)

    if 'parser' in model_list:
        parser = self.z.get_parser()
        self.register_function(parser.parse_sentence)
        self.register_function(parser.parse_file)

    if 'depparser' in model_list:
        parser = self.z.get_depparser()
        self.register_function(parser.dep_parse_sentence)
        self.register_function(parser.dep_parse_file)

    # register the function to remotely stop the server
    self.register_function(self.stop_server)

    self.quit = False
def __init__(self, zpar_model_directory=None, hostname=None, port=None):
    """
    Initialize the parser wrapper.

    Parameters
    ----------
    zpar_model_directory : str, optional
        The path to the directory containing the ZPar constituency model.
    hostname : str, optional
        The name of the machine on which the ZPar server is running, if any.
    port : int, optional
        The port at which the ZPar server is running, if any.

    Raises
    ------
    OSError
        If ZPar couldn't be loaded.
    """
    self.zpar_model_directory = zpar_model_directory
    if self.zpar_model_directory is None:
        self.zpar_model_directory = os.getenv("ZPAR_MODEL_DIR",
                                              "zpar/english")

    # TODO: allow pre-tokenized input
    self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    self._zpar_proxy = None
    self._zpar_ref = None

    # if a port is specified, then we want to use the server
    if port:

        # if no hostname was specified, then try the local machine
        hostname = "localhost" if hostname is None else hostname
        logging.info(f"Connecting to zpar server at {hostname}:{port} ...")

        # see if a server actually exists
        connected, server_proxy = self._get_rpc(hostname, port)
        if connected:
            self._zpar_proxy = server_proxy
        else:
            logging.warning('Could not connect to zpar server.')

    # otherwise, we want to use the python zpar module
    else:

        logging.info("Trying to locate zpar shared library ...")
        try:
            # Create a zpar wrapper data structure
            z = ZPar(self.zpar_model_directory)
            self._zpar_ref = z.get_parser()
        except OSError as e:
            logging.warning("Could not load zpar via python-zpar. "
                            "Did you set `ZPAR_MODEL_DIR` correctly?")
            raise e
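# Hypothetical sketch of the _get_rpc() helper the constructor above relies on
# (its real implementation is not shown in this snippet, and the real helper is
# a method of the wrapper class): open an XML-RPC proxy to the given host and
# port and probe it once so connection failures are reported up front. The
# probe call and error handling are assumptions.
import xmlrpc.client

def _get_rpc(hostname, port):
    proxy = xmlrpc.client.ServerProxy(f"http://{hostname}:{port}",
                                      allow_none=True)
    try:
        # any cheap call works as a liveness probe, provided the server
        # exposes XML-RPC introspection
        proxy.system.listMethods()
        return True, proxy
    except OSError:
        return False, None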
def setUp():
    """
    set up things we need for the tests
    """
    global z, tagger
    assert 'ZPAR_MODEL_DIR' in os.environ
    model_dir = os.environ['ZPAR_MODEL_DIR']
    z = ZPar(model_dir)
    tagger = z.get_tagger()
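# A minimal sketch of a matching teardown and a smoke test for the tagger
# fixture above; test_tag_sentence is illustrative, and the expected string
# mirrors the tagged example used elsewhere in these snippets rather than
# coming from the original test suite.
def tearDown():
    # release the loaded ZPar models
    z.close()

def test_tag_sentence():
    tagged_sent = tagger.tag_sentence("I am going to the market.")
    assert tagged_sent == "I/PRP am/VBP going/VBG to/TO the/DT market/NN ./."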
def setUp():
    """
    set up things we need for the tests
    """
    global z, depparser
    assert 'ZPAR_MODEL_DIR' in os.environ
    model_dir = os.environ['ZPAR_MODEL_DIR']
    z = ZPar(model_dir)
    depparser = z.get_depparser()
class StoppableServer(_baseclass):

    allow_reuse_address = True

    def __init__(self, addr, zpar_model_path, model_list, *args, **kwds):
        # store the hostname and port number
        self.myhost, self.myport = addr

        # store the link to the loaded zpar object
        self.z = ZPar(zpar_model_path)

        # initialize the parent class
        _baseclass.__init__(self, addr, *args, **kwds)

        # Call the individual loading functions
        # and only register the appropriate methods
        if 'tagger' in model_list:
            tagger = self.z.get_tagger()
            self.register_function(tagger.tag_sentence)
            self.register_function(tagger.tag_file)

        if 'parser' in model_list:
            parser = self.z.get_parser()
            self.register_function(parser.parse_sentence)
            self.register_function(parser.parse_file)
            self.register_function(parser.parse_tagged_sentence)
            self.register_function(parser.parse_tagged_file)

        if 'depparser' in model_list:
            parser = self.z.get_depparser()
            self.register_function(parser.dep_parse_sentence)
            self.register_function(parser.dep_parse_file)
            self.register_function(parser.dep_parse_tagged_sentence)
            self.register_function(parser.dep_parse_tagged_file)

        # register the function to remotely stop the server
        self.register_function(self.stop_server)

        self.quit = False

    def serve_forever(self):
        while not self.quit:
            try:
                self.handle_request()
            except KeyboardInterrupt:
                print("\nKeyboard interrupt received, exiting.")
                break
        self.z.close()
        self.server_close()

    def stop_server(self):
        self.quit = True
        return 0, "Server terminated on host %r, port %r" % (self.myhost,
                                                             self.myport)
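# A hedged client-side sketch: assuming _baseclass above is an XML-RPC server
# class (e.g. SimpleXMLRPCServer), a remote caller can invoke the registered
# methods and shut the server down like this. The host and port are
# illustrative.
import xmlrpc.client

proxy = xmlrpc.client.ServerProxy("http://localhost:8859")
print(proxy.tag_sentence("I am going to the market."))
print(proxy.dep_parse_sentence("I am going to the market."))
# asks the serve_forever() loop above to exit and release the models
print(proxy.stop_server())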
def main():
    # set up an argument parser
    parser = argparse.ArgumentParser(prog='zpar_example.py')
    parser.add_argument('--modeldir',
                        dest='modeldir',
                        help="Path to directory containing zpar English models",
                        required=True)

    # parse given command line arguments
    args = parser.parse_args()

    # use the zpar wrapper as a context manager
    with ZPar(args.modeldir) as z:

        # get the tagger and the dependency parser models
        tagger = z.get_tagger()
        depparser = z.get_depparser()

        # tag a sentence
        tagged_sent = tagger.tag_sentence("I am going to the market.")
        print_(tagged_sent)

        # tag an already tokenized sentence
        tagged_sent = tagger.tag_sentence("Do n't you want to come with me to the market ?",
                                          tokenize=False)
        print_(tagged_sent)

        # get the dependency parse of an already tagged sentence
        dep_parsed_sent = depparser.dep_parse_tagged_sentence("I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")
        print_(dep_parsed_sent)

        # get the dependency parse of an already tokenized sentence
        dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?",
                                                       tokenize=False)
        print_(dep_parsed_sent)

        # get the dependency parse of an already tokenized sentence
        # and include lemma information (assuming you have NLTK as well
        # as its WordNet corpus installed)
        dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?",
                                                       tokenize=False,
                                                       with_lemmas=True)
        print_(dep_parsed_sent)

        # compute POS tags for all sentences in "test.txt"
        # and write the output to "test.tag". Note that the
        # file contains a single sentence per line.
        # The sentences need not be word tokenized
        test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'test.txt')
        tagger.tag_file(test_file, "test.tag")

        # compute dependency parses for all sentences in "test_tokenized.txt"
        tokenized_test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                           'test_tokenized.txt')
        depparser.dep_parse_file(tokenized_test_file, "test.dep")
def __init__(self, zpar_model_directory=None, hostname=None, port=None):
    self.zpar_model_directory = zpar_model_directory
    if self.zpar_model_directory is None:
        self.zpar_model_directory = os.getenv('ZPAR_MODEL_DIR',
                                              'zpar/english')

    self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    self._zpar_proxy = None
    self._zpar_ref = None

    # if a port is specified, then we want to use the server
    if port:

        # if no hostname was specified, then try the local machine
        hostname = 'localhost' if not hostname else hostname
        logging.info('Trying to connect to zpar server at {}:{} ...'
                     .format(hostname, port))

        # try to see if a server actually exists
        connected, server_proxy = self._get_rpc(hostname, port)
        if connected:
            self._zpar_proxy = server_proxy
        else:
            logging.warning('Could not connect to zpar server')

    # otherwise, we want to use the python zpar module
    else:

        logging.info('Trying to locate zpar shared library ...')

        # get the path to the zpar shared library via the environment
        # variable
        zpar_library_dir = os.getenv('ZPAR_LIBRARY_DIR', '')
        zpar_library_path = os.path.join(zpar_library_dir, 'zpar.so')

        try:
            # Create a zpar wrapper data structure
            z = ZPar(self.zpar_model_directory)
            self._zpar_ref = z.get_parser()
        except OSError as e:
            logging.warning('Could not load zpar via python-zpar. '
                            'Did you set ZPAR_LIBRARY_DIR correctly? '
                            'Did you set ZPAR_MODEL_DIR correctly?')
            raise e
def read_data_use(option, sen2id):
    file_name = option.use_data_path
    max_length = option.num_steps
    dict_size = option.dict_size

    time1 = time.time()
    Rake = RAKE.Rake(RAKE.SmartStopList())
    z = ZPar(option.pos_path)
    tagger = z.get_tagger()
    time2 = time.time()
    print("read data load time: ", time2 - time1)

    with open(file_name) as f:
        data = []
        vector = []
        sta_vec_list = []
        j = 0
        for line in f:
            # truncate sentences longer than 15 tokens
            if len(line.strip().split()) > 15:
                line = ' '.join(line.strip().split()[:15])
            sta_vec = list(np.zeros([option.num_steps - 1]))

            # extract keywords with RAKE and POS-tag the sentence with ZPar
            keyword = Rake.run(line.strip())
            pos_list = tagger.tag_sentence(line.strip()).split()
            pos = list(zip(*[x.split('/') for x in pos_list]))[0]

            if keyword != []:
                keyword = list(list(zip(*keyword))[0])
                keyword_new = []
                linewords = line.strip().split()
                # mark the position of every token covered by a keyword phrase
                for i in range(len(linewords)):
                    for item in keyword:
                        length11 = len(item.split())
                        if ' '.join(linewords[i:i + length11]) == item:
                            keyword_new.extend([i + k for k in range(length11)])
                for i in range(len(keyword_new)):
                    ind = keyword_new[i]
                    if ind <= option.num_steps - 2:
                        sta_vec[ind] = 1

            if option.keyword_pos == True:
                sta_vec_list.append(keyword_pos2sta_vec(option, sta_vec, pos))
            else:
                sta_vec_list.append(list(np.zeros([option.num_steps - 1])))

            data.append(sen2id(line.strip().lower().split()))

    data_new = array_data(data, max_length, dict_size)
    return data_new, sta_vec_list  # sentence, keyvector
def read_data_use1(option, sen2id):
    file_name = option.use_data_path
    max_length = option.num_steps
    dict_size = option.dict_size

    Rake = RAKE.Rake(RAKE.SmartStopList())
    z = ZPar(option.pos_path)
    tagger = z.get_tagger()

    with open(file_name) as f:
        data = []
        vector = []
        sta_vec_list = []
        j = 0
        for line in f:
            print('sentence:' + line)
            sta_vec = list(np.zeros([option.num_steps - 1]))

            keyword = Rake.run(line.strip())
            pos_list = tagger.tag_sentence(line.strip()).split()
            pos = list(zip(*[x.split('/') for x in pos_list]))[0]
            print(keyword)

            if keyword != []:
                keyword = list(list(zip(*keyword))[0])
                keyword_new = []
                # map each keyword token back to its position in the sentence
                for item in keyword:
                    tem1 = [line.strip().split().index(x)
                            for x in item.split()
                            if x in line.strip().split()]
                    print('id', tem1)
                    keyword_new.extend(tem1)
                print(keyword_new)
                for i in range(len(keyword_new)):
                    ind = keyword_new[i]
                    if ind <= option.num_steps - 2:
                        sta_vec[ind] = 1

            if option.keyword_pos == True:
                sta_vec_list.append(keyword_pos2sta_vec(option, sta_vec, pos))
            else:
                sta_vec_list.append(list(np.zeros([option.num_steps - 1])))
            print(keyword_pos2sta_vec(option, sta_vec, pos))

            data.append(sen2id(line.strip().lower().split()))

    data_new = array_data(data, max_length, dict_size)
    return data_new, sta_vec_list  # sentence, keyvector
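# Small self-contained illustration of the tag-splitting idiom used by both
# read_data_use variants above: the tagger returns "word/TAG" tokens, and
# zip(*...) separates them into a word tuple and a POS-tag tuple. The model
# directory and sentence are illustrative.
from zpar import ZPar

with ZPar('english-models') as z:
    tagger = z.get_tagger()
    pos_list = tagger.tag_sentence("the cat sat on the mat").split()
    words, tags = zip(*[x.split('/') for x in pos_list])
    # e.g. words -> ('the', 'cat', ...), tags -> ('DT', 'NN', ...)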
def main(giga_db_loc, n_docs, pos_tag=False, parse=False):
    docs = Gigaword(giga_db_loc, limit=n_docs)
    sbd = nltk.data.load('tokenizers/punkt/english.pickle')

    from zpar import ZPar
    n = 0
    with ZPar('models/zpar') as z:
        tagger = z.get_tagger()
        if parse:
            parser = z.get_depparser()
        for doc in docs:
            sentences = sbd.tokenize(doc)
            for sent in sentences:
                if parse:
                    dep_parsed_sent = parser.dep_parse_sentence(sent)
                elif pos_tag:
                    tags = tagger.tag_sentence(sent)
                n += len(sent)
    print n
from zpar import ZPar

if __name__ == '__main__':
    # set up an argument parser
    parser = argparse.ArgumentParser(prog='zpar_example.py')
    parser.add_argument('--modeldir',
                        dest='modeldir',
                        help="Path to directory containing zpar English models",
                        required=True)

    # parse given command line arguments
    args = parser.parse_args()

    # use the zpar wrapper as a context manager
    with ZPar(args.modeldir) as z:

        # get the tagger and the dependency parser models
        tagger = z.get_tagger()
        depparser = z.get_depparser()

        # tag a sentence
        tagged_sent = tagger.tag_sentence("I am going to the market.")
        print_(tagged_sent)

        # tag an already tokenized sentence
        tagged_sent = tagger.tag_sentence("Do n't you want to come with me to the market ?",
                                          tokenize=False)
        print_(tagged_sent)

        # get the dependency parse of an already tagged sentence
from six import print_
from zpar import ZPar

# use the zpar wrapper as a context manager
with ZPar('english-models') as z:

    # get the tagger, the dependency parser, and the constituency parser models
    tagger = z.get_tagger()
    depparser = z.get_depparser()
    parser = z.get_parser()

    # tag a sentence
    tagged_sent = tagger.tag_sentence("I am going to the market.")
    print_(tagged_sent)

    # tag an already tokenized sentence
    tagged_sent = tagger.tag_sentence("Do n't you want to come with me to the market ?",
                                      tokenize=False)
    print_(tagged_sent)

    # get the dependency parse of an already tagged sentence
    dep_parsed_sent = depparser.dep_parse_tagged_sentence("I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")
    print_(dep_parsed_sent)

    # get the dependency parse of an already tokenized sentence
    dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?",
                                                   tokenize=False)
    print_(dep_parsed_sent)

    # get the dependency parse of an already tokenized sentence
    # and include lemma information (assuming you have NLTK as well
    # as its WordNet corpus installed)
    dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?",
                                                   tokenize=False,
                                                   with_lemmas=True)
    print_(dep_parsed_sent)
def extractStockNews(stock_newsDir, symCompHash, sentNUM):
    snp_comp = symCompHash.values()
    zparModel = ZPar('english-models')
    #tagger = zparModel.get_tagger()
    depparser = zparModel.get_depparser()
    stemmer = PorterStemmer()

    dayNews = []
    for dayDir in sorted(os.listdir(stock_newsDir)):
        if len(dayDir) != 10: continue
        #if int(dayDir[-2:]) > 5: continue
        #if dayDir != "2015-04-30": continue

        # collect the normalized leading sentences of each news article
        newsContents = set()
        for newsfile in sorted(os.listdir(stock_newsDir + dayDir)):
            content = open(stock_newsDir + dayDir + "/" + newsfile, "r").read()
            printable = set(string.printable)
            content = filter(lambda x: x in printable, content)
            #sents = get_valid_news_content(content)
            sents = get_valid_1stpara_news(content)
            if sents is None: continue
            headline = re.sub("^(rpt )?update\s*\d+\s", "",
                              "###".join(sents[:sentNUM]).lower())
            headline = re.sub("\s+", " ", headline)
            newsContents.add(headline)

        oneDayNews = []  # [(matchedSNPComp, headline), ...]
                         # matchedSNPComp: [(matchedPart, WholeCompName), ...]
        fullNameNum = 0
        doubtCounter = Counter()

        if 0:
            print "\n".join(sorted(list(newsContents)))
            continue

        newsHash = {}
        headlineCompHash = {}
        for headline in newsContents:
            # match full S&P company names and a short list of ticker symbols
            fullMatch = findComp_name(headline.replace("###", " "), snp_comp)
            #symMatch = [(word, symCompHash[word]) for word in headline.replace("###", " ").split() if word in symCompHash and word not in ["a", "an", "has"]]
            symMatch = [word for word in headline.replace("###", " ").split()
                        if word in ["ge", "gt", "gm", "aig", "cvs", "oi",
                                    "adm", "jpm", "twc", "cvc", "se"]]
            symMatch = list([symCompHash[sym] for sym in set(symMatch)])

            if fullMatch is not None or len(symMatch) > 0:
                if 0:
                    print "---------------------------"
                    print fullMatch, symMatch
                    print headline
                    continue
                if fullMatch is not None:
                    symMatch.extend(fullMatch)
                headlineCompHash[headline] = symMatch

                # get valid words in headline
                parsed_sents = [depparser.dep_parse_sentence(sent)
                                for sent in headline.split("###")]
                triples = frmTriple(parsed_sents, None)
                triples = [stemmer.stem(word) for word in triples
                           if word not in [":", "(", ")", ",", ".", "\"", "'"]]
                # deduplicate headlines that reduce to the same stemmed word set
                sortedText = " ".join(sorted(triples))
                if sortedText not in newsHash:
                    newsHash[sortedText] = headline

        for impText, headline in newsHash.items():
            fullNameNum += 1
            oneDayNews.append((headlineCompHash[headline], headline, impText.split()))

        #doubtMatch = [matchedComp[idx] for idx in range(len(matchedComp)) if matchScore[idx] > 0.33 and matchScore[idx] < 0.66]
        #wrongMatch = [matchedComp[idx] for idx in range(len(matchedComp)) if matchScore[idx] <= 0.33]
        #print "full", fullNameNum, len(newsContents), round(float(fullNameNum)/len(newsContents), 2)
        print "## Stock news extracting done in day", dayDir, " #snp_matched", fullNameNum, " out of all", len(newsContents), time.asctime()
        dayNews.append(oneDayNews)
        #break
    return dayNews
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pickle as pkl

from config import config
config = config()

from utils import *
import sys
sys.path.insert(0, config.dict_path)
from dict_use import *

import RAKE
Rake = RAKE.Rake(RAKE.SmartStopList())

from zpar import ZPar
z = ZPar(config.pos_path)
tagger = z.get_tagger()

tt_proportion = 0.9


class dataset(object):

    def __init__(self, input, sequence_length, target):
        self.input = input
        self.target = target
        self.sequence_length = sequence_length
        self.length = len(input)

    def __call__(self, batch_size, step):
        batch_num = self.length // batch_size
        step = step % batch_num
        return self.input[step * batch_size:(step + 1) * batch_size], \
            self.sequence_length[step * batch_size:(step + 1) * batch_size], \
            self.target[step * batch_size:(step + 1) * batch_size]
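# Minimal sketch of driving the dataset class above with toy arrays; the
# shapes, dtypes, and batch size are illustrative assumptions rather than the
# values used by the original data pipeline.
toy_input = np.zeros((8, 15), dtype=np.int32)   # token ids, one row per sentence
toy_lengths = np.full(8, 15, dtype=np.int32)    # sequence lengths
toy_target = np.zeros((8, 15), dtype=np.int32)  # target token ids
toy_set = dataset(toy_input, toy_lengths, toy_target)

batch_size = 4
for step in range(toy_set.length // batch_size):
    batch_input, batch_lengths, batch_target = toy_set(batch_size, step)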