from nltk.corpus import wordnet as wn
from nltk.parse.corenlp import CoreNLPParser


class SynsetExample(object):
    """Lazily fetches and tokenizes example sentences for WordNet synsets,
    optionally falling back to BabelNet glosses via a py4j gateway."""

    def __init__(self, use_extended_gloss=False):
        self.synset_example = {}
        self.tokenizer = CoreNLPParser(url='http://localhost:42636')
        self.use_babelnet = use_extended_gloss
        if self.use_babelnet:
            from py4j.java_gateway import JavaGateway
            gateway = JavaGateway()
            self.sense = gateway.entry_point

    def __getitem__(self, name):
        if name not in self.synset_example:
            self.synset_example[name] = self.get_synset_example(name)
        return self.synset_example[name]

    def get_synset_example(self, name):
        synset = wn.synset(name)
        if self.use_babelnet:
            # WordNet satellite adjectives ("s") map to plain adjectives ("a")
            # in BabelNet synset ids.
            synset_pos = synset.pos()
            if synset_pos == "s":
                synset_pos = "a"
            synset_id = 'wn:{}{}'.format(str(synset.offset()).zfill(8), synset_pos)
            example = self.sense.getExampleByWnSynsetId(synset_id)
            if not example:
                example = " ".join(synset.examples()).strip()
        else:
            example = " ".join(synset.examples()).strip()
        return [x.lower() for x in self.tokenizer.tokenize(example)]
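# A minimal usage sketch, assuming a CoreNLP server on port 42636 and WordNet
# data installed for nltk; 'dog.n.01' is an arbitrary synset name chosen for
# illustration. With use_extended_gloss=True, a py4j gateway exposing
# getExampleByWnSynsetId would also need to be running.
examples = SynsetExample(use_extended_gloss=False)
print(examples['dog.n.01'])  # lowercased tokens of the synset's example sentence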
import os

try:
    import cPickle  # Python 2
except ImportError:
    import pickle as cPickle  # Python 3

from nltk.parse.corenlp import CoreNLPParser

# Maps the two-letter emotion codes in the annotation file to class indices.
EMOTION_LABELS = {
    "hp": 0, "sd": 1, "ag": 2, "dg": 3,
    "sp": 4, "fr": 5, "me": 6, "ne": 7,
}


def create_dataset_bin(annotation_file, data_file):
    parser = CoreNLPParser(url='http://localhost:9080')
    dirname = os.path.dirname(os.path.realpath(__file__)) + "/"
    dataset = []
    with open(annotation_file, "r") as file1, open(data_file, "r") as file2:
        for line_from_file_1, line_from_file_2 in zip(file1, file2):
            label_code = line_from_file_1.split()[0]
            # Unknown codes map to None, as in the original if/elif chain.
            output = EMOTION_LABELS.get(label_code)
            dataset.append((output, list(parser.tokenize(line_from_file_2))))
    print(len(dataset))
    with open(dirname + "Pickle/dataset_ready", 'wb') as outfile:
        cPickle.dump(dataset, outfile)
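# Hypothetical invocation; the annotation file is assumed to carry one
# two-letter emotion code per line ("hp", "sd", ..., "ne"), aligned
# line-by-line with the sentences in the data file. Requires a CoreNLP
# server on port 9080 and a Pickle/ directory next to this script.
create_dataset_bin("annotations.txt", "sentences.txt")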
import os

import torch
from nltk.parse.corenlp import CoreNLPParser

# read_glove_vectors, LSTM, get_input_vector and device are project-local helpers.


def start_testing(trained_model_file):
    parser = CoreNLPParser(url='http://localhost:9080')
    emotions = ['happiness', 'sadness', 'anger', 'disgust', 'surprise', 'fear']
    dirname = os.path.dirname(os.path.realpath(__file__)) + "/"
    glove_model = read_glove_vectors(dirname + "Pickle/gloveModel")

    # Hyperparameters must match those used at training time.
    hidden_size = 256
    num_layers = 2
    bidirectional = False
    batchnorm = False
    dropout_hidden = 0.3
    dropout_output = 0.9
    model = LSTM(300, hidden_size, num_layers, bidirectional, batchnorm,
                 dropout_hidden, dropout_output).to(device)

    with torch.no_grad():
        model.load_state_dict(torch.load(trained_model_file))
        print(model)
        model.eval()
        while True:
            test_sentence = input("Give a test sentence: ")
            sentence = list(parser.tokenize(test_sentence))
            input1, sent_length = get_input_vector(glove_model, sentence)
            class_pred = model(input1, sent_length)
            print("Sentence: " + test_sentence)
            _, pred = class_pred.max(dim=1)
            print("Prediction:\t" + emotions[pred[0]])
            print("Output Values:")
            percentages = torch.nn.functional.softmax(class_pred, dim=1) * 100
            for i in range(len(emotions)):
                print(emotions[i] + " %" + str(percentages.data.tolist()[0][i]))
def build_vocab(json: str, threshold: int, keeppunctuation: bool,
                host_address: str, character_level: bool = False,
                zh: bool = True):
    """Build vocabulary from a json file, dropping all words with counts < threshold.

    Args:
        json (string): Input json file. Should have a column named 'caption'.
        threshold (int): Threshold to drop all words with counts < threshold.
        keeppunctuation (bool): Includes or excludes punctuation.

    Returns:
        vocab (Vocab): Object with the processed vocabulary.
    """
    df = pd.read_json(json)
    counter = Counter()
    if zh:
        from zhon.hanzi import punctuation
        parser = CoreNLPParser(host_address)
        for i in tqdm(range(len(df)), leave=False):
            caption = str(df.loc[i]['caption'])
            # Remove all punctuation
            if not keeppunctuation:
                caption = re.sub("[{}]".format(punctuation), "", caption)
            if character_level:
                tokens = list(caption)
            else:
                tokens = list(parser.tokenize(caption))
            counter.update(tokens)
    else:
        punctuation = ',.()'
        for i in tqdm(range(len(df)), leave=False):
            caption = str(df.loc[i]['caption'])
            # Remove all punctuation
            if not keeppunctuation:
                caption = re.sub("[{}]".format(punctuation), "", caption)
            if character_level:
                tokens = list(caption)
            else:
                tokens = caption.split()
            counter.update(tokens)
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    return vocab
class NLTK_NLP():

    def __init__(self, ip_port):
        self.dep_parser = CoreNLPDependencyParser(url=ip_port)
        self.ner_parser = CoreNLPParser(url=ip_port, tagtype='ner')
        self.parser = CoreNLPParser(url=ip_port)
        self.pos_tagger = CoreNLPParser(url=ip_port, tagtype='pos')

    def generate_dependency_tree(self, sentence):
        '''what is the name of the asteroid ?'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return dependency_tree

    def generate_dependency_graph(self, sentence):
        '''Each node is a 7-tuple of ``word, lemma, ctag, tag, feats, head, rel``, e.g.
        12 {'address': 12, 'word': '.', 'lemma': '.', 'ctag': '.', 'tag': '.',
        'feats': '', 'head': 1, 'deps': defaultdict(<class 'list'>, {}), 'rel': 'punct'}'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return DependencyGraph(dependency_tree.to_conll(10))

    def generate_constituency_tree(self, sentence):
        '''input: one question'''
        tree_list = list(self.parser.raw_parse(sentence=sentence))
        return tree_list[0]

    def get_pos(self, sentence):
        '''What is the airspeed of an unladen swallow ?
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'),
        ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]'''
        pos_list = list(self.pos_tagger.tag(sentence.split()))
        return pos_list

    def get_pos_by_tokens(self, tokens):
        '''What is the airspeed of an unladen swallow ?'''
        pos_list = list(self.pos_tagger.tag(tokens))
        return pos_list

    def get_ner(self, sentence):
        '''april the 26th, 1882 is the birth date of which athletes ?
        [('april', 'DATE'), ('the', 'DATE'), ('26th', 'DATE'), (',', 'DATE'),
        ('1882', 'DATE'), ('is', 'O'), ('the', 'O'), ('birth', 'O'), ('date', 'O'),
        ('of', 'O'), ('which', 'O'), ('athletes', 'O'), ('?', 'O')]'''
        sequence_ner_tuple_list = self.ner_parser.tag(sentence.split())
        return [ner_tag for _, ner_tag in sequence_ner_tuple_list]

    def get_tokenizer(self, sentence):
        return list(self.parser.tokenize(sentence))

    def find_phrases(self, tree, phrase_tag='NP'):
        return [subtree.leaves()
                for subtree in tree.subtrees(lambda t: t.label() == phrase_tag)]
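# A minimal usage sketch, assuming a CoreNLP server (with the pos, ner and
# depparse annotators loaded) is reachable at this address.
nlp = NLTK_NLP('http://localhost:9000')
tree = nlp.generate_constituency_tree('What is the airspeed of an unladen swallow ?')
print(nlp.find_phrases(tree, phrase_tag='NP'))  # leaves of every NP subtree
print(nlp.get_ner('april the 26th, 1882 is the birth date of which athletes ?'))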
class Lex_parser:

    def __init__(self, tag_id_initialized=False, tag_id=None, uncased=True):
        self.uncased = uncased
        self.tag_id_initialized = tag_id_initialized
        if tag_id_initialized:
            self.tag_to_id = tag_id
        else:
            self.tag_to_id = {"CLSSEP": 0, "UNKNOWN": 1}
        self.parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        self.basic_tokenizer = BasicTokenizer()

    def tokenize(self, sentence):
        return list(self.parser.tokenize(sentence))

    def convert_sentence_to_tags(self, sentence: Union[str, list]):
        if type(sentence) == str:
            if self.uncased:
                sentence = sentence.lower()
        else:
            sentence = " ".join(sentence)
            if self.uncased:
                sentence = sentence.lower()
        sentence = self.basic_tokenizer.tokenize(sentence)
        # Restore the pronoun 'I' so the POS tagger sees it capitalized.
        sentence = list(map(lambda x: x.upper() if x == 'i' else x, sentence))
        tags = self.parser.tag(sentence)
        if not self.tag_id_initialized:
            # Grow the tag vocabulary on the fly as new POS tags appear.
            for tag in tags:
                if tag[1] not in self.tag_to_id:
                    self.tag_to_id[tag[1]] = len(self.tag_to_id)
        return tags

    def convert_tags_to_ids(self, tags):
        return list(map(lambda x: self.tag_to_id[x[1]], tags))

    def convert_sentence_to_ids(self, sentence: Union[str, list]):
        if not self.parser:
            self.parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        tags = self.convert_sentence_to_tags(sentence)
        ids = self.convert_tags_to_ids(tags)
        return list(ids)
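# Hypothetical usage, assuming a CoreNLP server with a POS tagger on port 9000;
# BasicTokenizer comes from the surrounding project (e.g. a BERT-style tokenizer).
lex = Lex_parser()
tags = lex.convert_sentence_to_tags("The quick brown fox jumps over the lazy dog")
print(tags)                           # e.g. [('the', 'DT'), ('quick', 'JJ'), ...]
print(lex.convert_tags_to_ids(tags))  # tag-vocabulary indices, built on the fly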
def tokenize_and_write_to_tokenresult(text, dest):
    # https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK
    url = 'http://localhost:9000'
    dep_parser = CoreNLPDependencyParser(url=url)  # instantiated but unused here
    parser = CoreNLPParser(url)
    # Write lowercased tokens separated by spaces, ending the line at each '.'.
    for token in parser.tokenize(text):
        if token == '.':
            dest.write(token.lower() + '\n')
        else:
            dest.write(token.lower() + ' ')
def process(input_labels_csv: str, output_file: str,
            hostname="http://localhost:9000", character_level: bool = False):
    captions = pd.read_csv(input_labels_csv, sep='\t', encoding='utf-8')
    parser = CoreNLPParser(hostname)
    captions = captions[captions.caption.notnull()]
    captions['tokens'] = None
    for idx, row in tqdm(captions.iterrows(), total=len(captions)):
        caption = row['caption']
        # Remove punctuation (`punctuation` is assumed imported elsewhere,
        # e.g. from zhon.hanzi for Chinese captions or from the string module).
        caption = re.sub("[{}]".format(punctuation), "", caption)
        if character_level:
            captions.at[idx, 'tokens'] = list(caption)
        else:
            captions.at[idx, 'tokens'] = list(parser.tokenize(caption))
    captions.to_json(output_file)
def tokenize(text, url='http://localhost:9000'):
    """Tokenize text with CoreNLP.

    Parameters
    ----------
    text : str
        The text to tokenize.
    url : str, optional
        URL of the CoreNLP web server (default: 'http://localhost:9000').

    Returns
    -------
    str
        The sentence as space-separated tokens.
    """
    parser = CoreNLPParser(url)
    tokens = list(parser.tokenize(text))
    return ' '.join(tokens)
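# A quick usage sketch; the segmentation depends on which language models the
# CoreNLP server has loaded (a Chinese pipeline in the original setup).
print(tokenize('今天天气不错'))   # e.g. '今天 天气 不错' with a Chinese model
print(tokenize('Hello, world.'))  # e.g. 'Hello , world .' with the English model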
def tokenize_tweet(t):
    """
    Use Stanford's PTBTokenizer to tokenize the tweet.
    Requires a Stanford CoreNLP Server to be running. For setup information see:
    [ https://stanfordnlp.github.io/CoreNLP/index.html,
      https://www.khalidalnajjar.com/setup-use-stanford-corenlp-server-python/ ]
    Here we use the wrapper from the `nltk` package instead of stanfordcorenlp.

    From the directory where you set up the Stanford NLP, run the server:
        java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
            -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" \
            -port 9000 -timeout 30000

    Args:
        t: The tweet to be tokenized

    Returns:
        t (generator): A generator over the tokens of the tweet.
    """
    parser = CoreNLPParser(url='http://localhost:9000/')
    return parser.tokenize(t)
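# Hypothetical usage; tokenize_tweet returns a generator, so materialize it
# with list() when you need all tokens at once.
print(list(tokenize_tweet("This tokenizer can't handle #hashtags? Sure it can!")))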
class StanfordNLTKWrapper:

    def __init__(self, config_file_path='aida_event/config/xmie.json'):
        self._config = read_dict_from_json_file(config_file_path)
        self._domain_name = self._config['common_tools']['stanford_url']
        self._port_number = self._config['common_tools']['stanford_port']
        self._pos_model = self._config['common_tools']['stanford_pos_model']
        self._pos_jar = self._config['common_tools']['stanford_pos_jar']
        self._parser_model = self._config['common_tools']['stanford_parser_model']
        self._parser_jar = self._config['common_tools']['stanford_parser_jar']

        self._core_nlp_parser = CoreNLPParser(
            url='%s:%s' % (self._domain_name, self._port_number))
        self._pos_tagger = StanfordPOSTagger(model_filename=self._pos_model,
                                             path_to_jar=self._pos_jar)
        self._dep_parser = StanfordDependencyParser(
            path_to_jar=self._parser_jar,
            path_to_models_jar=self._parser_model,
            java_options='-Xmx16G')

    def tokenizer(self, input_text):
        return list(self._core_nlp_parser.tokenize(input_text))

    def pos_tag(self, input_tokenized_sentence):
        return self._pos_tagger.tag(input_tokenized_sentence)

    def pos_tag_sentences(self, input_tokenized_sentences):
        return self._pos_tagger.tag_sents(input_tokenized_sentences)

    def dependency_parser(self, input_tokenized_pos_tagged_sentence):
        return self._dep_parser.tagged_parse(input_tokenized_pos_tagged_sentence)

    def dependency_parser_sentences(self, input_tokenized_pos_tagged_sentences):
        return self._dep_parser.tagged_parse_sents(input_tokenized_pos_tagged_sentences)
class CoreNLPTokenizer(Tokenizer):

    def __init__(self,
                 url: str = 'http://localhost:9000',
                 encoding: str = 'utf-8',
                 start_tokens: List[str] = None,
                 end_tokens: List[str] = None):
        self._parser = CoreNLPParser(url, encoding, 'pos')
        self._start_tokens = start_tokens or []
        # We reverse the tokens here because we're going to insert them with
        # `insert(0)` later; this makes sure they show up in the right order.
        self._start_tokens.reverse()
        self._end_tokens = end_tokens or []

    @overrides
    def tokenize(self, text: str) -> List[Token]:
        tokens = [Token(t) for t in self._parser.tokenize(text)]
        for start_token in self._start_tokens:
            if isinstance(start_token, int):
                token = Token(text_id=start_token, idx=0)
            else:
                token = Token(text=start_token, idx=0)
            tokens.insert(0, token)
        for end_token in self._end_tokens:
            if isinstance(end_token, int):
                token = Token(text_id=end_token, idx=0)
            else:
                token = Token(text=end_token, idx=0)
            tokens.append(token)
        return tokens
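# A usage sketch in the AllenNLP style this class follows; '<s>'/'</s>' are
# arbitrary sentinel tokens chosen for illustration.
tokenizer = CoreNLPTokenizer(start_tokens=['<s>'], end_tokens=['</s>'])
print([t.text for t in tokenizer.tokenize("Hello, world.")])
# ['<s>', 'Hello', ',', 'world', '.', '</s>']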
def test_connection(self):
    st = CoreNLPParser()
    print('testing connection...')
    list(st.tokenize("test"))
    print('Server ready!')
def build_vocab(input_json: str, threshold: int, keep_punctuation: bool,
                host_address: str, character_level: bool = False,
                zh: bool = True):
    """Build vocabulary from a json file, dropping all words with counts < threshold.

    Args:
        input_json (string): Preprocessed json file, structured like this:
            {
              'audios': [
                {
                  'audio_id': 'xxx',
                  'captions': [
                    {
                      'caption': 'xxx',
                      'cap_id': 'xxx'
                    }
                  ]
                },
                ...
              ]
            }
        threshold (int): Threshold to drop all words with counts < threshold.
        keep_punctuation (bool): Includes or excludes punctuation.

    Returns:
        vocab (Vocab): Object with the processed vocabulary.
    """
    data = json.load(open(input_json, "r"))["audios"]
    counter = Counter()
    if zh:
        from nltk.parse.corenlp import CoreNLPParser
        from zhon.hanzi import punctuation
        parser = CoreNLPParser(host_address)
        for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
            for cap_idx in range(len(data[audio_idx]["captions"])):
                caption = data[audio_idx]["captions"][cap_idx]["caption"]
                # Remove all punctuation
                if not keep_punctuation:
                    caption = re.sub("[{}]".format(punctuation), "", caption)
                if character_level:
                    tokens = list(caption)
                else:
                    tokens = list(parser.tokenize(caption))
                data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
                counter.update(tokens)
    else:
        punctuation = ',.():;?!"\''
        for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
            for cap_idx in range(len(data[audio_idx]["captions"])):
                caption = data[audio_idx]["captions"][cap_idx]["caption"].lower()
                # Remove all punctuation
                if not keep_punctuation:
                    caption = re.sub("[{}]".format(punctuation), " ", caption)
                    caption = re.sub(" +", " ", caption)
                if character_level:
                    tokens = list(caption)
                else:
                    tokens = caption.split()
                data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
                counter.update(tokens)
    # Write the tokenized captions back into the input json.
    json.dump({"audios": data}, open(input_json, "w"), indent=4)

    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word("<pad>")
    vocab.add_word("<start>")
    vocab.add_word("<end>")
    vocab.add_word("<unk>")

    # Add the words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    return vocab
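# Hypothetical invocation; requires a CoreNLP server for zh=True and assumes
# the Vocabulary class from the surrounding project. Note that the function
# also writes a "tokens" field back into the input json, in place.
vocab = build_vocab("data/captions.json", threshold=1, keep_punctuation=False,
                    host_address="http://localhost:9000")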
        # (fragment: tail of a clustering method over the embedding matrix self.E)
        km.fit(self.E)
        self.T = km.cluster_centers_.astype(np.float32)
        self.T /= np.linalg.norm(self.T, axis=-1, keepdims=True)
        return self


if __name__ == '__main__':
    w2v = word2vec('../../data/tokenized.txt')
    w2v.embed('../word_vector/yelp.w2v', 400)
    print("Word2Vec training finished.")

    # The original `tokenizer` is not shown in this excerpt; a CoreNLPParser
    # pointed at a local server is one plausible choice (an assumption, not
    # necessarily the original code).
    from nltk.parse.corenlp import CoreNLPParser
    tokenizer = CoreNLPParser(url='http://localhost:9000')

    origin_path = '../../data/converted'
    dest_path = '../../data/absc_encoded'
    names = ['train.txt', 'dev.txt', 'test.txt']
    for name in names:
        print(f"Processing {name}...")
        with open(os.path.join(origin_path, name), 'r') as f:
            lines = f.readlines()
        with open(os.path.join(dest_path, name), 'w') as f:
            for line in tqdm(lines):
                review = line.split('\t')[0]
                score = line.split('\t')[1]  # keeps its trailing newline
                tokens = tokenizer.tokenize(review)
                # Map each token to its word index, falling back to <unk>.
                indices = [
                    str(w2v.w2i[token]) if token in w2v.w2i
                    else str(w2v.w2i['<unk>'])
                    for token in tokens
                ]
                f.write(' '.join(indices) + '\t' + score)
def _execute(args):
    index, line = args
    if data_format == 'jsonl':
        d = json.loads(line)
        label = d.get('gold_label', '-').strip().lower()
        sent1 = d['sentence1'].strip()
        sent2 = d['sentence2'].strip()
    elif data_format == 'tsv':
        l = line.split('\t')
        if corpus_type == 'snli':
            label = l[0].strip().lower()
            sent1 = l[5].strip()
            sent2 = l[6].strip()
        elif corpus_type == 'xnli':
            sent1 = l[0].strip()
            sent2 = l[1].strip()
            label = l[2].strip().lower()
        else:
            raise ValueError(f'Invalid `corpus_type`: {corpus_type}')
    else:
        raise ValueError(f'Invalid `data_format`: {data_format}')

    if label == 'contradictory':
        label = 'contradiction'
    if (not sent1) or (not sent2):
        tqdm.write(f'Line [{index}]: blank sentence, skipping. {line}',
                   file=sys.stderr)
        return

    sent1 = remove_cjk_whitespace(sent1)
    sent2 = remove_cjk_whitespace(sent2)

    segments = []
    if corenlp:
        parser = CoreNLPParser(corenlp)
        for sent in (sent1, sent2):
            tokens = list(parser.tokenize(sent))
            segments.append(' '.join(tokens))
    elif ltp:
        for sent in (sent1, sent2):
            tokens = []
            r = requests.post(ltp, data={'s': sent, 'x': 'n', 't': 'ws'})
            r.raise_for_status()
            ltp_result = r.json()
            for ltp_sent in ltp_result[0]:
                for ltp_w in ltp_sent:
                    ws = ltp_w['cont'].strip()
                    if ws:
                        tokens.append(ws)
            segments.append(' '.join(tokens))

    result = json.dumps(
        {
            'index': index,
            'gold_label': label,
            'sentence1': segments[0],
            'sentence2': segments[1],
        },
        ensure_ascii=False)
    if output_file:
        with lock:
            print(result, file=f_out, flush=flush)
    else:
        tqdm.write(result)
def build_vocab(df: pd.DataFrame, threshold: int, keeppunctuation: bool,
                host_address: str, character_level: bool = False,
                zh: bool = True, pretokenized: bool = False):
    """Build vocabulary from a dataframe, dropping all words with counts < threshold.

    Args:
        df (pd.DataFrame): Input dataframe. Should have a column named 'caption'.
        threshold (int): Threshold to drop all words with counts < threshold.
        keeppunctuation (bool): Includes or excludes punctuation.

    Returns:
        vocab (Vocab): Object with the processed vocabulary.
    """
    from nltk.parse.corenlp import CoreNLPParser
    from zhon.hanzi import punctuation

    counter = Counter()
    if pretokenized:
        assert "tokens" in df.columns, "Pretokenized words should be in the `tokens` column"
    if zh:
        parser = CoreNLPParser(host_address)
        for i in tqdm(range(len(df)), leave=False, ascii=True):
            if pretokenized:
                tokens = df.iloc[i]['tokens']
            else:
                caption = str(df.iloc[i]['caption'])
                # Remove all punctuation
                if not keeppunctuation:
                    caption = re.sub("[{}]".format(punctuation), "", caption)
                if character_level:
                    tokens = list(caption)
                else:
                    tokens = list(parser.tokenize(caption))
            counter.update(tokens)
    else:
        punctuation = ',.()'
        for i in tqdm(range(len(df)), leave=False, ascii=True):
            if pretokenized:
                tokens = df.iloc[i]['tokens']
            else:
                caption = str(df.iloc[i]['caption'])
                # Remove all punctuation
                if not keeppunctuation:
                    caption = re.sub("[{}]".format(punctuation), "", caption)
                if character_level:
                    tokens = list(caption)
                else:
                    tokens = caption.split()
            counter.update(tokens)
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    return vocab
path_to_model= "C:/Users/user/Desktop/NLP/sanford-segmenter-2015-12-09/data/pku.gz", path_to_dict= "C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz", ) text = ("这是斯坦福中文分词器测试") segmenter.segment(text) #這台跑不出來QQ #%% NLTK CoreNLPParser #必須先在cmd執行java(nlp start server.txt) from nltk.parse.corenlp import CoreNLPParser corenlp_parser = CoreNLPParser('http://localhost:9001', encoding='utf8') token_list = list(corenlp_parser.tokenize(ptt_sim)) #%% thulac import thulac thu1 = thulac.thulac(seg_only=True) thu1.cut(ptt_sim, text=True) thu1.cut(news_sim, text=True) #%% CKIWP import os import subprocess def ckipws_tokenizes(input): os.chdir("C:/Users/user/Desktop/NLP/CKIPWS")
def tokenize_caption(input_json: str, keep_punctuation: bool = False,
                     host_address: str = None, character_level: bool = False,
                     zh: bool = True):
    """Tokenize the captions in a preprocessed json file and write the tokens back.

    Args:
        input_json (string): Preprocessed json file, structured like this:
            {
              'audios': [
                {
                  'audio_id': 'xxx',
                  'captions': [
                    {
                      'caption': 'xxx',
                      'cap_id': 'xxx'
                    }
                  ]
                },
                ...
              ]
            }
        keep_punctuation (bool): Includes or excludes punctuation.
        host_address (string): Address of the CoreNLP server (used when zh=True).
        character_level (bool): Tokenize into characters instead of words.

    The file is updated in place: each caption gains a 'tokens' field.
    """
    data = json.load(open(input_json, "r"))["audios"]
    if zh:
        from nltk.parse.corenlp import CoreNLPParser
        from zhon.hanzi import punctuation
        parser = CoreNLPParser(host_address)
        for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
            for cap_idx in range(len(data[audio_idx]["captions"])):
                caption = data[audio_idx]["captions"][cap_idx]["caption"]
                # Remove all punctuation
                if not keep_punctuation:
                    caption = re.sub("[{}]".format(punctuation), "", caption)
                if character_level:
                    tokens = list(caption)
                else:
                    tokens = list(parser.tokenize(caption))
                data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
    else:
        punctuation = ',.():;?!"\''
        for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
            for cap_idx in range(len(data[audio_idx]["captions"])):
                caption = data[audio_idx]["captions"][cap_idx]["caption"].lower()
                # Remove all punctuation
                if not keep_punctuation:
                    caption = re.sub("[{}]".format(punctuation), " ", caption)
                    caption = re.sub(" +", " ", caption)
                if character_level:
                    tokens = list(caption)
                else:
                    tokens = caption.split()
                data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
    json.dump({"audios": data}, open(input_json, "w"), indent=4)
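# Hypothetical invocation mirroring build_vocab above; the json file is
# rewritten in place with a "tokens" field added to every caption.
tokenize_caption("data/captions.json", host_address="http://localhost:9000")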
#!/usr/bin/python3
# coding: utf-8
##################################################################
## CoreNLP
# server$ cd ~/datasets/Lib/CoreNLP/stanford-corenlp-full-2018-01-31
# server$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
#     -preload tokenize,ssplit,pos,lemma,parse,depparse \
#     -status_port 9000 -port 9000 -timeout 15000
# client$ ssh -fN -L 9000:localhost:9000 [email protected] -p 23622
#     # forward local port 9000 (left) to the lab server's port 9000 (right)
from nltk.parse.corenlp import CoreNLPParser

stanford = CoreNLPParser()
text = 'proved to be fake, made-up'
token = list(stanford.tokenize(text)); print(token)  # ['proved', 'to', 'be', 'fake', ',', 'made-up']
text = 'proved  to be   fake, made-up'  # extra whitespace does not change the tokens
token = list(stanford.tokenize(text)); print(token)  # ['proved', 'to', 'be', 'fake', ',', 'made-up']
# ../jptstanford_corenlp/l1_tokenizer.py offers the same functionality,
# but it needs root privileges, which is a hassle
def segment_one(url, s):
    parser = CoreNLPParser(url, tagtype='pos')
    return list(parser.tokenize(pre_segment(s)))
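# Hypothetical usage; pre_segment is a project-local preprocessing helper
# assumed to be defined elsewhere in this module.
print(segment_one('http://localhost:9000', '今天天气不错'))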