def stanford_tokenize(texts: typing.List[str]) -> typing.List[typing.List[str]]:
    """Tokenize every string in the given list and return one token list per string."""
    tokenizer = StanfordTokenizer()
    return tokenizer.tokenize_sents(texts)
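# Usage sketch for stanford_tokenize (assumes NLTK can locate the Stanford
# tokenizer jar, e.g. via CLASSPATH; the sample sentences are placeholders).
if __name__ == "__main__":
    sample_texts = ["Good muffins cost $3.88 in New York.", "Thanks."]
    token_lists = stanford_tokenize(sample_texts)
    assert len(token_lists) == len(sample_texts)  # one token list per input string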
def tokenize_stopwords_stemmer(texts):
    # texts: a list holding the string to process
    # Use this block for the Stanford tokenizer; it is not needed for plain tokenization.
    # tokenize
    Str_texts = texts[0]
    # tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a string

    # drop tokens that start with special characters
    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)
    texts_filtered0 = [document for document in texts_tokenized
                       if document not in pa1.findall(document)]

    # split tokens joined by '_', '-' or '.'
    p2 = r'.+[-_\./].+'
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            if document.find('_') > -1:
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
        else:
            texts_filtered.append(document)
    texts_filtered = [document for document in texts_filtered
                      if document != '' and document != "''" and document != "``"]

    # stopwords
    english_stopwords = stopwords.words('english')  # stopword list
    texts_filtered_stopwords = [document for document in texts_filtered
                                if document not in english_stopwords]

    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*',
                            '@', '#', '$', '%', '\n', '1', '2', '3', '4', '5', '6', '7',
                            '8', '9', '0', '<', '>', '/', '\"', '\'', '{', '}', '!', '~',
                            '`', '$', '^', '/*', '*/', '/**', '**/', '**', '-', '_', '+',
                            '=', r'-?-', r'@?']  # punctuation list
    texts_filtered = [document for document in texts_filtered_stopwords
                      if document not in english_punctuations]

    # stemming
    porter = nltk.PorterStemmer()
    texts_Stemmered = [porter.stem(t) for t in texts_filtered]  # a list
    return texts_Stemmered  # returns a list
def segment(texts):
    tk = StanfordTokenizer()
    results = {}
    for text in texts:
        words = tk.tokenize(text)
        segmented = ' '.join(words).lower()
        results[text] = segmented
    return results
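# Usage sketch for segment() (assumes the Stanford tokenizer jar is configured
# for NLTK; the input string is a placeholder). Each input string maps to its
# lower-cased, space-joined token sequence.
if __name__ == "__main__":
    seg = segment(["Hello world!"])
    assert set(seg) == {"Hello world!"}   # keys are the original strings
    assert seg["Hello world!"].islower()  # values are lower-cased token strings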
def tokenize_q(qa, phase):
    qas = len(qa)
    MyTokenizer = StanfordTokenizer()
    for i, row in enumerate(tqdm(qa)):
        row['question_toked'] = MyTokenizer.tokenize(row['question'].lower())[:14]
        if i % 50000 == 0:
            json.dump(qa, open('vqa_' + phase + '_toked_' + str(i) + '.json', 'w'))
        if i == qas - 1:
            json.dump(qa, open('vqa_' + phase + '_toked.json', 'w'))
def simp_syn_sent(sent):
    strs = ""
    # the original tokens in the sent
    tokens = StanfordTokenizer().tokenize(sent)
    tokens.insert(0, '')

    result = list(eng_parser.raw_parse(sent))[0]
    root = result.root['word']
    # w = result.tree()

    # TODO: use the tree structure, check again
    node_list = []
    # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    for node in result.nodes.items():
        node_list.append(base.get_triples(node))

    # try each simplification rule in turn and return the first one that fires
    if len(sent) > 0:
        strs = simp_coordi_sent(tokens, node_list)
        if len(strs) > 0:
            return strs
        else:
            strs = simp_subordi_sent(tokens, node_list)
            if len(strs) > 0:
                return strs
            else:
                strs = simp_advcl_sent(tokens, node_list)
                if len(strs) > 0:
                    return strs
                else:
                    strs = simp_parti_sent(tokens, node_list)
                    if len(strs) > 0:
                        return strs
                    else:
                        strs = simp_adjec_sent(tokens, node_list)
                        if len(strs) > 0:
                            return strs
                        else:
                            strs = simp_appos_sent(tokens, node_list)
                            if len(strs) > 0:
                                return strs
                            else:
                                strs = simp_passive_sent(tokens, node_list)
                                if len(strs) > 0:
                                    return strs
    return strs
def segment_en(texts, flag_keep_number=False):
    tk = StanfordTokenizer()
    results = {}
    for text in texts:
        if flag_keep_number:
            words = tk.tokenize(text)
        else:
            words = [replace_number(w) for w in tk.tokenize(text)]
        segmented = ' '.join(words).lower()
        results[text] = segmented
    return results
def Tokenize_stopwords_stemmer(texts):
    # print(time())
    # Use this block for the Stanford tokenizer; it is not needed for plain tokenization.
    # tokenize
    Str_texts = texts[0]
    print(os.getcwd())
    # tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a string
    # print(time())

    # split tokens joined by '_', '-' or '.'
    p2 = r'.+[-_\./"].+'
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_tokenized:
        if document in pa2.findall(document):
            if document.find('_') > -1:
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
        else:
            texts_filtered.append(document)
    # print(time())

    # drop special-character tokens, stopwords and punctuation in one pass
    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)
    p3 = r'.+">'
    pa3 = re.compile(p3)
    english_stopwords = stopwords.words('english')  # stopword list
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*',
                            '@', '#', '$', '%', '\n', '1', '2', '3', '4', '5', '6', '7',
                            '8', '9', '0', '<', '>', '/', '\"', '\'', '{', '}', '!', '~',
                            '`', '$', '^', '/*', '*/', '/**', '**/', '**', '-', '_', '+',
                            '=', r'-?-', r'@?']  # punctuation list
    texts_filtered0 = []
    for document in texts_filtered:
        if (document in pa1.findall(document) or document in pa3.findall(document)
                or document == '' or document == "''" or document == "``"
                or document in english_stopwords or document in english_punctuations):
            pass
        else:
            texts_filtered0.append(document)
    # print(time())

    # stemming
    porter = nltk.PorterStemmer()
    texts_Stemmered = [porter.stem(t) for t in texts_filtered0]  # a list
    # print(time())
    return texts_Stemmered  # returns a list
def readwordarr(isTokenize=True):
    posWords = []
    negWords = []
    stopwords = getstopword()
    if isTokenize:
        tokenizer = StanfordTokenizer()
        with open(negfilepath, 'r', encoding='utf-8') as sentences:
            arr = tokenizer.tokenize(sentences.read())
            for line in arr:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                negWords.append(list(wordset))
        with open(posfilepath, 'r', encoding='utf-8') as sentences:
            arr = tokenizer.tokenize(sentences.read())
            for line in arr:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                posWords.append(list(wordset))
    else:
        with open(negfilepath, 'r', encoding='utf-8') as sentences:
            lines = sentences.readlines()
            for line in lines:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                negWords.append(list(wordset))
        with open(posfilepath, 'r', encoding='utf-8') as sentences:
            lines = sentences.readlines()
            for line in lines:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                posWords.append(list(wordset))
    return posWords, negWords
class WordSegment(object):

    def __init__(self, user_dict=None):
        self.conf_io = conf.load("io")
        self.conf_corenlp = conf.load("stanford_corenlp")
        self.conf_embedding = conf.load("embedding")
        conf_tokenizer = self.conf_corenlp["tokenizer"]
        conf_postagger = self.conf_corenlp["postagger"]
        prefix = self.conf_corenlp["prefix"]
        self.enTokenizer = StanfordTokenizer(
            path_to_jar=prefix + conf_tokenizer["path_to_jar"]
        )
        self.zh_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_zh"],
            path_to_jar=prefix + conf_postagger["path_to_jar"]
        )
        self.en_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_en"],
            path_to_jar=prefix + conf_postagger["path_to_jar"]
        )
        # TODO: add the user-defined dictionary here

    def get_tokens(self, text):
        tokens = self.enTokenizer.tokenize(text)
        return self.en_tagger.tag(tokens)

    def get_new_words(self, text):
        pass
def dtm_builder(self, runType):
    '''
    desc:    coordinates all data processing and tokenizes every element of the
             sequence
    returns: a one-hot encoded 3-dimensional matrix of training data plus labels
             representing the next element in the sequence (training), or the
             encoded test data with its vocabulary lookup (testing)
    '''
    if runType == 'training':
        dataFiles = ['{}/{}'.format(DATA_DIR, file) for file in os.listdir(DATA_DIR)
                     if file.endswith('.txt')]
        allTxt = '<eos>'.join([self._readFile(file) for file in dataFiles])
    elif runType == 'testing':
        inputString = input('Enter test string: ')
        allTxt = inputString
    assert isinstance(allTxt, str), 'input must be a string'
    allTxtTok = StanfordTokenizer().tokenize(allTxt)
    allTxt_allSeq = '||*||'.join(allTxtTok).split('<eos>')
    allTxt_bySeq = [seq.split('||*||') for seq in allTxt_allSeq]
    allTxt_bySeq = [list(filter(None, seq)) for seq in allTxt_bySeq]
    for seq in allTxt_bySeq:
        seq.append('<eos>')
    txtDocTokBySeqPad = self._padSeq(allTxt_bySeq)
    unqVoc_LookUp = self._buildVocLookUp(txtDocTokBySeqPad, runType)
    if runType == 'training':
        oheTrainData, oheTrainLabel = self._oneHotEncode(txtDocTokBySeqPad, unqVoc_LookUp, runType)
        return [oheTrainData, oheTrainLabel]
    else:
        oheTrainData = self._oneHotEncode(txtDocTokBySeqPad, unqVoc_LookUp, runType)
        return [oheTrainData, unqVoc_LookUp, inputString]
def simp_syn_sent_(sent): strs = "" # the original tokens in the sent """ lst1 = "Peter, who liked fruits, ate an apple.".split() _lst = sent.split() #import pdb; pdb.set_trace() if lst1 == _lst: return "Peter liked fruits. Peter ate an apple." """ #import pdb; pdb.set_trace() #print(sent) #import pdb; pdb.set_trace() tokens = StanfordTokenizer().tokenize(str(sent)) #tokens = wordpunct_tokenize(str(sent)) tokens.insert(0, '') result = list(eng_parser.raw_parse(sent))[0] root = result.root['word'] #import pdb; pdb.set_trace() #w = result.tree() #print "parse_tree:", w #for row in result.triples(): # print(row) #import pdb; pdb.set_trace() #TODO: use the tree structure, check again node_list = [] # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]]) for node in result.nodes.items(): node_list.append(base.get_triples(node)) #node_list[base.get_triples[0]] = base.get_triples(node) #import pdb; pdb.set_trace() #strs = simp_coordi_sent(tokens, node_list) #strs = simp_subordi_sent(tokens, node_list) #strs = simp_advcl_sent(tokens, node_list) #strs = simp_parti_sent(tokens, node_list) strs = simp_adjec_sent(tokens, node_list) #strs = simp_appos_sent(tokens, node_list) #strs = simp_passive_sent(tokens, node_list) return strs
def data():
    with open("wonderland.txt", "r", encoding="utf-8-sig") as file:
        words = StanfordTokenizer(r"C:\stanford-postagger-2016-10-31\stanford-postagger.jar") \
            .tokenize(file.read().lower())
    voc_list = sorted(set(words))
    vocabulary = dict(zip(voc_list, itertools.count()))
    words_idx = [vocabulary[word] for word in words]
    return voc_list, vocabulary, words_idx
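# Usage sketch for data() (assumes wonderland.txt and the stanford-postagger jar
# exist at the paths hard-coded in the function).
if __name__ == "__main__":
    voc_list, vocabulary, words_idx = data()
    assert len(words_idx) >= len(voc_list)                          # tokens >= unique types
    assert all(vocabulary[w] == i for i, w in enumerate(voc_list))  # ids follow sorted order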
class POSTagger:
    """POSTagger creates a POS tagger for the German language.
    Different taggers are available to use."""

    STAN = "stanford-hgc-tagger"
    SFT = "stanford-fast-tagger"
    TT = "tree-tagger"
    SPACY = "spacy-tagger"

    # paths to Stanford tagger modules
    __path_to_jar = "C:/Users/din_m/MA/Stanford Tagger/stanford-postagger.jar"
    __model_file_name = "C:/Users/din_m/MA/Stanford Tagger/models/"

    def __init__(self, tagger):
        """Initialize a new POS tagger. Takes the tagger parameter as an argument
        to define the kind of tagger."""
        self.__tokenizer = StanfordTokenizer(path_to_jar=POSTagger.__path_to_jar)
        if tagger == POSTagger.STAN:
            self.tagger_name = POSTagger.STAN
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-hgc.tagger")
        elif tagger == POSTagger.SFT:
            self.tagger_name = POSTagger.SFT
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-fast.tagger")
        elif tagger == POSTagger.TT:
            self.tagger_name = POSTagger.TT
            self.__tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')
        # SpaCy takes really long to initialize (about 5-7 minutes), but performs well and fast afterwards
        elif tagger == POSTagger.SPACY:
            self.tagger_name = POSTagger.SPACY
            self.__tagger = spacy.load('de')
        else:
            raise Exception("Wrong tagger parameter.")

    def tag(self, text):
        """POS tag tokenized text."""
        if self.tagger_name == POSTagger.SFT or self.tagger_name == POSTagger.STAN:
            tokens = self.__tokenizer.tokenize(text)
            return self.__tagger.tag(tokens)
        elif self.tagger_name == POSTagger.TT:
            tags = self.__tagger.tag_text(text)
            tuple_list = []
            tag_list = treetaggerwrapper.make_tags(tags)
            for item in tag_list:
                tuple_list.append((item[0], item[1]))
            return tuple_list
        elif self.tagger_name == POSTagger.SPACY:
            tags = self.__tagger(text)
            tuple_list = []
            for word in tags:
                tuple_list.append((word.orth_, word.tag_))
            return tuple_list
        else:
            pass

# tagger = POSTagger("spacy-tagger")
# doc = tagger.tag(u"Bei mir zu Hause denken sie bestimmt, daß ich noch krank sei.")
# print(tagger.tag("Ich werde morgen in die Schule gehen."))
# print(tagger.tag("Hat Aglaja den Brief etwa der Alten gezeigt?«"))
def instance_tokenizer(language, stanfordpath=None):
    # you can add more params or kinds of tokenizer from here:
    # http://www.nltk.org/api/nltk.tokenize.html
    if stanfordpath:
        tok = StanfordTokenizer(path_to_jar=stanfordpath)
    else:
        tok = WordPunctTokenizer()
    return tok
def tokenize_stopwords_stemmer(texts):
    Str_texts = texts[0]
    # tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(
        path_to_jar=r"C:\Users\zw\Desktop\stanford-parser.jar")
    # path_to_jar locates the parser jar; the r prefix stops backslash escapes,
    # so a '\t' in the path stays literal instead of becoming a tab
    java_path = 'E:soft/Java/jdk1.8.0_121/bin/java.exe'
    os.environ['JAVAHOME'] = java_path
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a string; tokenize it
    # print(texts_tokenized)

    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)
    # re.compile() turns the regex string into a Pattern object, which is then
    # used to match the text and produce Match results
    texts_filtered0 = [document for document in texts_tokenized
                       if document not in pa1.findall(document)]

    p2 = r'.+[-_\/].+'
    # changed from r'.+[-_\./].+' to r'.+[-_\/].+' so dots inside numbers are kept,
    # e.g. version strings like 3.1.2
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            if document.find('_') > -1:
                # split() cuts the string on the given separator and returns a list
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
            elif document.find('/') > -1:
                texts_filtered = texts_filtered + document.split('/')
        else:
            texts_filtered.append(document)
    texts_filtered = [document for document in texts_filtered
                      if document != '' and document != "''" and document != "``"]
    # drop empty strings and quote tokens ('' and ``)

    # stopwords
    # english_stopwords = stopwords.words('english')  # stopword list
    # texts_filtered_stopwords = [document for document in texts_filtered
    #                             if document not in english_stopwords]  # filter out stopwords

    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*',
                            '@', '#', '$', '%', '\n', '||', '<', '>', '/', '\"', '\'',
                            '{', '}', '!', '~', '`', '0', '$', '^', '/*', '*/', '/**',
                            '**/', '**', '-', '_', '__', '|', '+', '=',
                            r'-?-', r'@?']  # punctuation list
    texts_filtered = [document for document in texts_filtered
                      if document not in english_punctuations]  # filter out punctuation
    return texts_filtered
def __init__(self, task_queue, result_queue):
    multiprocessing.Process.__init__(self)
    self.task_queue = task_queue
    self.result_queue = result_queue
    self.tokenizer = StanfordTokenizer(options={"ptb3Escaping": True})
    print('%s: Loading pickles...' % self.name)
    self.map_word_index = map_word_index_model
    print('%s: Done.' % self.name)
def data():
    with open("wonderland.txt", "r", encoding="utf-8-sig") as file:
        return [
            word.lower()
            for word in StanfordTokenizer(
                path_to_jar=r"C:\stanford-postagger-2016-10-31\stanford-postagger.jar",
                options={
                    "normalizeParentheses": "false",
                    "normalizeOtherBrackets": "false"
                }).tokenize(file.read())
        ]
def simp_syn_sent_(sent): strs = "" #print(sent) #import pdb; pdb.set_trace() tokens = StanfordTokenizer().tokenize(str(sent)) #tokens = wordpunct_tokenize(str(sent)) tokens.insert(0, '') re = list(eng_parser.raw_parse(sent))[0] root = re.root['word'] node_list = [] # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]]) for node in re.nodes.items(): node_list.append(base.get_triples(node)) #result = list(eng_parser.raw_parse(sent))[0] #root = result.root['word'] strs = simp_relcl_sent(tokens, node_list) return strs
def tokenize(text_list, clean_html=False, tokenizer="twitter",
             remove_reps=True, spell_correct=True):
    if tokenizer == "stanford":
        tokenizer_obj = StanfordTokenizer()
    elif tokenizer == "twitter":
        tokenizer_obj = TweetTokenizer()
    else:
        tokenizer_obj = StringTokenizer()
    token_list = []
    for text in text_list:
        if clean_html:
            text = BeautifulSoup(text).get_text()
        if remove_reps:
            text = re.sub(r'(.)\1{2,}', r'\1\1', text)
        tokens = tokenizer_obj.tokenize(text)
        if spell_correct:
            tokens = [spell(t) for t in tokens]
        token_list.append(tokens)
    return token_list
def stanford_tokenizer(text):
    tokenizer = StanfordTokenizer(
        path_to_jar='D:/software/stanford-parser-full-3.7/stanford-parser-3.7.0-models.jar')
    # sent = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
    return tokenizer.tokenize(text)

# if __name__ == '__main__':
#     sent = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
#     result = stanford_tokenizer(sent)
#     print(result)

# st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
# from nltk.tokenize import StanfordTokenizer
# s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
# StanfordTokenizer().tokenize(s)
# s = "The colour of the wall is blue."
# StanfordTokenizer(options={"americanize": True}).tokenize(s)
def __init__(self, **kwargs):
    self.conf_io = conf.load("io")
    self.conf_corenlp = conf.load("stanford_corenlp")
    self.conf_embedding = conf.load("embedding")
    conf_segmenter = self.conf_corenlp["segmenter"]
    conf_tokenizer = self.conf_corenlp["tokenizer"]
    conf_postagger = self.conf_corenlp["postagger"]
    prefix = self.conf_corenlp["prefix"]
    self.segmenter = StanfordSegmenter(
        path_to_jar=prefix + conf_segmenter["path_to_jar"],
        path_to_sihan_corpora_dict=prefix + conf_segmenter["path_to_sihan_corpora_dict"],
        path_to_model=prefix + conf_segmenter["path_to_model"],
        path_to_dict=prefix + conf_segmenter["path_to_dict"],
        path_to_slf4j=prefix + conf_segmenter["path_to_slf4j"],
        encoding=conf_segmenter["encoding"])
    self.enTokenizer = StanfordTokenizer(
        path_to_jar=prefix + conf_tokenizer["path_to_jar"])
    self.zh_tagger = StanfordPOSTagger(
        prefix + conf_postagger["tagger_zh"],
        path_to_jar=prefix + conf_postagger["path_to_jar"])
    self.en_tagger = StanfordPOSTagger(
        prefix + conf_postagger["tagger_en"],
        path_to_jar=prefix + conf_postagger["path_to_jar"])
    self.frequency = defaultdict(int)
    pynlpir.open()
    pynlpir.nlpir.ImportUserDict(conf.load("pynlpir")["user_dict"], Overwrite=False)
    try:
        self.excluded_docs = kwargs["excluded_docs"]
    except KeyError:
        self.excluded_docs = [""]
    # experimental features
    self.f_token_indexes = prefix + conf.load("pynlpir")["user_dict"]
def _get_sentence_embeddings(sentences, ngram='bigrams', model='concat_wiki_twitter'):
    """ Returns a numpy matrix of embeddings for one of the published models. It
    handles tokenization and can be given raw sentences.
    Arguments:
        - ngram: 'unigrams' or 'bigrams'
        - model: 'wiki', 'twitter', or 'concat_wiki_twitter'
        - sentences: a list of raw sentences ['Once upon a time', 'This is another sentence.', ...]
    """
    wiki_embeddings = None
    twitter_embeddings = None
    tokenized_sentences_NLTK_tweets = None
    tokenized_sentences_SNLP = None
    if model == "wiki" or model == 'concat_wiki_twitter':
        tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
        s = ' <delimiter> '.join(sentences)  # just a trick to make things faster
        tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
        tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
        assert (len(tokenized_sentences_SNLP) == len(sentences))
        if ngram == 'unigrams':
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(
                tokenized_sentences_SNLP, MODEL_WIKI_UNIGRAMS, FASTTEXT_EXEC_PATH)
        else:
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(
                tokenized_sentences_SNLP, MODEL_WIKI_BIGRAMS, FASTTEXT_EXEC_PATH)
    if model == "twitter" or model == 'concat_wiki_twitter':
        tknzr = TweetTokenizer()
        tokenized_sentences_NLTK_tweets = tokenize_sentences(tknzr, sentences)
        if ngram == 'unigrams':
            twitter_embeddings = get_embeddings_for_preprocessed_sentences(
                tokenized_sentences_NLTK_tweets, MODEL_TWITTER_UNIGRAMS, FASTTEXT_EXEC_PATH)
        else:
            twitter_embeddings = get_embeddings_for_preprocessed_sentences(
                tokenized_sentences_NLTK_tweets, MODEL_TWITTER_BIGRAMS, FASTTEXT_EXEC_PATH)
    if model == "twitter":
        return twitter_embeddings
    elif model == "wiki":
        return wiki_embeddings
    elif model == "concat_wiki_twitter":
        return np.concatenate((wiki_embeddings, twitter_embeddings), axis=1)
    sys.exit(-1)
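# Usage sketch for _get_sentence_embeddings (assumes SNLP_TAGGER_JAR, the model
# paths and FASTTEXT_EXEC_PATH are configured at module level as the function
# expects; the sentences are placeholders).
if __name__ == "__main__":
    demo_sentences = ["Once upon a time.", "This is another sentence."]
    embeddings = _get_sentence_embeddings(demo_sentences, ngram="unigrams", model="wiki")
    assert embeddings.shape[0] == len(demo_sentences)  # one embedding row per sentence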
def get_sentence_embeddings(sentences, ngram='uni'):
    tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
    s = ' <delimiter> '.join(sentences)  # just a trick to make things faster
    tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
    tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
    if len(tokenized_sentences_SNLP) != len(sentences):
        print('SENT2VEC TOKENIZATION FAILED')
        tokenized_sentences_SNLP = sentences
    # assert(len(tokenized_sentences_SNLP) == len(sentences))
    if ngram == 'uni':
        embeddings = get_embeddings_for_preprocessed_sentences(
            tokenized_sentences_SNLP, MODEL_TORONTOBOOKS_UNIGRAMS, FASTTEXT_EXEC_PATH)
    elif ngram == 'bi':
        embeddings = get_embeddings_for_preprocessed_sentences(
            tokenized_sentences_SNLP, MODEL_TORONTOBOOKS_BIGRAMS, FASTTEXT_EXEC_PATH)
    else:
        raise NotImplementedError
    return embeddings
def par_tokenize(text_list, clean_html=False, tokenizer="twitter",
                 remove_reps=True, spell_correct=True):
    if tokenizer == "stanford":
        tolkenizer_obj = StanfordTokenizer()
    elif tokenizer == "twitter":
        tolkenizer_obj = TweetTokenizer()
    else:
        tolkenizer_obj = StringTokenizer()
    import multiprocessing as mp
    from functools import partial
    pool = mp.Pool(NUM_PROC)
    tolkenize_func = partial(__tolkenize_text_blob, clean_html=clean_html,
                             remove_reps=remove_reps, spell_correct=spell_correct,
                             tolkenizer_obj=tolkenizer_obj)
    token_list = pool.map(tolkenize_func, text_list)
    return token_list
def tokenize_and_save_corpus(corpus_filename, new_filename):
    with open(corpus_filename, 'r') as f:
        corpus_str = f.read()
    tokenized = StanfordTokenizer().tokenize(corpus_str)
    lowered = [w.lower() for w in tokenized]
    num = r'(?<!\S)(\d*\.?\d+|\d{1,3}(,\d{3})*(\.\d+)?)(?!\S)'
    number_words = {}
    new_words = []
    for word in lowered:
        if word in number_words:
            new_words.extend(number_words[word])
        else:
            numbers = re.findall(num, word)
            if numbers:
                number = numbers[0][0]
                nwords = word_numbers(number)
                number_words[word] = nwords
                new_words.extend(nwords)
            else:
                new_words.append(word)
    with open(new_filename, 'w', encoding='utf-8') as f:
        f.write(' '.join(new_words))
def tokenize(text_rdd, clean_html=False, tokenizer="twitter",
             remove_reps=True, spell_correct=True):
    if tokenizer == "stanford":
        tokenizer_obj = StanfordTokenizer()
    elif tokenizer == "twitter":
        tokenizer_obj = TweetTokenizer()
    else:
        tokenizer_obj = StringTokenizer()
    print("Processing {} tokens".format(text_rdd.count()))
    if remove_reps:
        text_rdd = text_rdd.map(lambda text: re.sub(r'(.)\1{2,}', r'\1\1', text))
    if clean_html:
        text_rdd = text_rdd.map(lambda text: BeautifulSoup(text).get_text())
    # note: the map below instantiates a TweetTokenizer on the workers regardless
    # of the tokenizer_obj selected above
    tokens_rdd = text_rdd.map(lambda text: TweetTokenizer().tokenize(text))
    if spell_correct:
        tokens_rdd = tokens_rdd.map(lambda tokens: [spell(t) for t in tokens])
    # tokens_rdd = tokens_rdd.map(lambda tokens: [t for t in tokens])
    return tokens_rdd
import argparse
from nltk.tokenize import StanfordTokenizer

aparser = argparse.ArgumentParser(
    description="Run CoreNLP tokenizer on a TSV definition file")
aparser.add_argument('input_filepath', type=str, help='input file path')
aparser.add_argument('output_filepath', type=str, help='output file path')
aparser.add_argument('corenlp_postagger_path', type=str,
                     help="path to stanford-postagger.jar")

opt = aparser.parse_args()
tokenizer = StanfordTokenizer(path_to_jar=opt.corenlp_postagger_path,
                              options={"ptb3Escaping": "false",
                                       "tokenizePerLine": "true",
                                       "tokenizeNLs": "true"})
entries = []
definitions = []
with open(opt.input_filepath) as ifp:
    for line in ifp:
        parts = line.strip().split('\t')
        entries.append(parts[:-1])
        definitions.append(parts[-1])
def_str = "\n".join(definitions)
tokens = tokenizer.tokenize(def_str)
def_str = " ".join(tokens)
definitions = def_str.split("*NL*")
with open(opt.output_filepath, 'w') as ofp:
    for entry, definition in zip(entries, definitions):
        ofp.write("{}\t{}\n".format('\t'.join(entry), definition.strip()))
    elif token == '-RCB-':
        token = '}'
    return token


def tokenize_sentences(tknzr, sentences, to_lower=True):
    """Arguments:
        - tknzr: a tokenizer implementing the NLTK tokenizer interface
        - sentences: a list of sentences
        - to_lower: lowercasing or not
    """
    return [tokenize(tknzr, s, to_lower) for s in sentences]


fileName = sys.argv[1]
SNLP_TAGGER_JAR = "/home/pgupta/stanford-postagger.jar"

sentences = []
with open(fileName, 'r') as fileinput:
    for line in fileinput:
        sentences.append(line)

tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
s = ' <delimiter> '.join(sentences)
tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
for sentence in tokenized_sentences_SNLP:
    print(sentence)
def relcl(sent):
    strs = ""
    tokens = StanfordTokenizer().tokenize(str(sent))
    # tokens = wordpunct_tokenize(str(sent))
    tokens.insert(0, '')

    result = list(eng_parser.raw_parse(sent))[0]
    root = result.root['word']

    node_list = []
    # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    for node in result.nodes.items():
        node_list.append(base.get_triples(node))

    # strs = simp_relcl_sent(tokens, node_list)
    dep = next(eng_parser.raw_parse(sent))
    triples = list(dep.triples())
    nsubj = ""
    verb = ""
    for node in triples:
        if 'acl:relcl' in node[1]:
            nsubj = node[0][0]
            verb = node[2][0]
            # break

    nsubj_ind = tokens.index(nsubj)
    verb_ind = tokens.index(verb)
    # split_ind = tokens.index(COMMA)

    # if split_ind < verb_ind:
    _str1 = tokens[:nsubj_ind + 1]
    str1 = ' '.join(_str1) + " . "
    _str2 = tokens[nsubj_ind + 1:]
    if _str2[0] in PUNCTUATION:
        _str2.pop(0)
    if ('which' in _str2[0]) or ('who' in _str2[0]):
        _str2.pop(0)
    str2 = base.replace_nsubj(tokens, nsubj) + ' '.join(_str2)
    strs = str1 + str2

    """
    stree = [parse.tree() for parse in eng_parser.raw_parse(sent)][0]
    for postn in stree.treepositions():
        if stree.label().endswith("=H"):
            parentpos = postn[:-1]
            partial = Tree(stree[parentpos].label(), [stree[postn]])
    """
    # strs = simp_relcl_sent(result)
    """
    lst = []
    se = 0
    head = ""
    dependent = ""
    for nd in triples:
        if 'nsubj' in nd[1] or 'nsubjpass' in nd[1]:
            head = nd[0][0]
            dependent = nd[2][0]
    """
    # for node in node_list[1:]:
    return strs
def __init__(self, classifier, jar_file, field_to_process, output_field):
    self.classifier = classifier
    self.jar_file = jar_file
    self.field_to_process = field_to_process
    self.output_field = output_field
    self.tokenizer = StanfordTokenizer(path_to_jar=self.jar_file).tokenize
# lines = f.read().encode('utf-8').split('</text>')
# for index, line in enumerate(lines):
# remove leading and trailing whitespace
lines = file.encode('utf-8').split('</text>')
for line in lines:
    newline = ''
    try:
        if "<text xml:space=\"preserve\">" in line and "#REDIRECT" not in line:
            newline = line[line.find("<text xml:space=\"preserve\">") +
                           len("<text xml:space=\"preserve\">"):]
            if guess_language(newline) == 'en':
                s = re.sub('[^A-Za-z0-9\s.,\'\";?$%+-:!]+', '@',
                           re.sub('\d', '0', newline).replace('[', ' ').replace(']', ' ')
                           .replace('}', ' ').replace('{', ' '))
                s2 = StanfordTokenizer().tokenize(s)
                s3 = [word.encode('ascii') for word in s2]
                charCounter = 0
                tokenCounter = 0
                sentStart = 0
                deleteThese = []
                for index, token in enumerate(s3):
                    if token == '.':
                        if charCounter < 20 or tokenCounter < 5:
                            deleteThese.append([sentStart, index])
                        charCounter = 0
                        tokenCounter = 0
                        sentStart = index + 1
                    else:
                        charCounter += len(token)
                        tokenCounter += 1
# -*- coding: utf-8 -*-
from nltk.tokenize import StanfordTokenizer
import time


def is_ascii(s):
    return all(ord(c) < 128 for c in s)


last_time = time.time()
line_buffer = ''
with open('WestburyLab.Wikipedia.Corpus.txt') as infp, \
        open('TokenizedCorpus.txt', 'w') as outfp:
    for e, line in enumerate(infp):
        if (e + 1) % 10000 == 0:
            line_buffer = StanfordTokenizer().tokenize(line_buffer)
            try:
                outfp.write(' '.join(line_buffer) + '\n')
            except:
                for i in range(len(line_buffer)):
                    if not is_ascii(line_buffer[i]):
                        line_buffer[i] = '<UNK>'
                outfp.write(' '.join(line_buffer) + '\n')
            line_buffer = ''
            print(e + 1, '/ 30749930', float(e + 1) / 30749930, time.time() - last_time)
        if line.strip() == '':
            continue
        line_buffer += (line + ' <br> ')
java_path = "C:/Program Files/Java/jre1.8.0_131/bin/java.exe"
parser_path = "D:/stanford-parser-full-2016-10-31/stanford-parser.jar"
models_path = "D:/stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
engPCFG_path = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"

import os
os.environ['JAVA_HOME'] = java_path

from nltk.tokenize import StanfordTokenizer
tokenizer = StanfordTokenizer(parser_path)

from nltk.parse.stanford import StanfordDependencyParser
parser = StanfordDependencyParser(parser_path, models_path, engPCFG_path)

import nltk
from nltk.tree import Tree
from nltk.corpus.reader.wordnet import Lemma
from nltk.corpus import semcor
from nltk.corpus import wordnet

noun = set(['NN', 'NNS', 'NNP', 'NNPS'])
verb = set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
adjective = set(['JJ', 'JJR', 'JJS'])
adverb = set(['RB', 'RBR', 'RBS'])
substantive = noun | verb | adjective | adverb

corp = semcor.sents()
tags = semcor.tagged_sents(tag='sem')
def _simp_syn_sent(sent, _algs=range(1, 10)):
    strs = ""
    """
    # order the ALG for the better performance (precision/recall)
    _algs_lst_ = [
        paratax.simp_paratax_sent,
        #punct.simp_punct_sent,
        subordi.simp_subordi_sent,
        adverb.simp_adverb_sent,
        parti.simp_parti_sent,
        appos.simp_appos_sent,
        adjec.simp_adjec_sent,
        coordi.simp_coordi_sent,
        passive.simp_passive_sent
    ]
    """
    # the original tokens in the sent
    tokens = StanfordTokenizer().tokenize(sent)
    # tokens = wordpunct_tokenize(strs)
    tokens.insert(0, '')
    # taggers = eng_tagger.tag(sent.split())

    result = list(eng_parser.raw_parse(sent))[0]
    root = result.root['word']
    # w = result.tree()

    # TODO: use the tree structure, check again
    node_list = []
    # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    for node in result.nodes.items():
        node_list.append(base.get_triples(node))

    alg = ""
    """
    if len(sent) > 0:
        for ind in _algs:
            # if the alg in the choices
            print "_alg: ", _algs_lst[ind]
            if len(strs) > 0:
                return strs, _algs_lst[ind]
            else:
                #func = _algs_lst[ind]
                strs = _algs_lst[ind](tokens, node_list)
    """
    # Use the most robust rule ordering for the experiments
    if len(sent) > 0:
        strs = paratax.simp_paratax_sent(tokens, node_list)
        if len(strs) > 0:
            alg = "paratax"
            return strs, alg
        else:
            strs = punct.simp_punct_sent(tokens, node_list)
            if len(strs) > 0:
                alg = "punct"
                return strs, alg
            else:
                strs = subordi.simp_subordi_sent(tokens, node_list)
                if len(strs) > 0:
                    alg = "subordi"
                    return strs, alg
                else:
                    strs = adverb.simp_adverb_sent(tokens, node_list)
                    if len(strs) > 0:
                        alg = "adverb"
                        return strs, alg
                    else:
                        strs = parti.simp_parti_sent(tokens, node_list)
                        if len(strs) > 0:
                            alg = "parti"
                            return strs, alg
                        else:
                            strs = appos.simp_appos_sent(tokens, node_list)
                            if len(strs) > 0:
                                alg = "appos"
                                return strs, alg
                            else:
                                strs = adjec.simp_adjec_sent(tokens, node_list)
                                if len(strs) > 0:
                                    alg = "adjec"
                                    return strs, alg
                                else:
                                    strs = coordi.simp_coordi_sent(tokens, node_list)
                                    if len(strs) > 0:
                                        alg = "coordi"
                                        return strs, alg
                                    else:
                                        strs = passive.simp_passive_sent(tokens, node_list)
                                        if len(strs) > 0:
                                            alg = "passive"
                                            return strs, alg
                                        else:
                                            strs = relcl.simp_relcl_sent(tokens, node_list)
                                            if len(strs) > 0:
                                                alg = "relcl"
                                                return strs, alg
    return strs, alg
from nltk.tag.stanford import StanfordNERTagger, StanfordPOSTagger
from nltk.tokenize import StanfordTokenizer
from wordsegment import load, segment

CUR_DIRECTORY = '/home/wmq/Desktop/DeepText/StanfordNLP'
SEGMENT_PATH = CUR_DIRECTORY + '/stanford-segmenter-3.8.0.jar'
NER_MODEL_PATH = CUR_DIRECTORY + '/english.all.3class.distsim.crf.ser.gz'
NER_JAR_PATH = CUR_DIRECTORY + '/stanford-ner.jar'
POS_MODEL_PATH = CUR_DIRECTORY + '/english-left3words-distsim.tagger'
POS_JAR_PATH = CUR_DIRECTORY + '/stanford-postagger.jar'

ner_tagger = StanfordNERTagger(NER_MODEL_PATH, NER_JAR_PATH, java_options='')
pos_tagger = StanfordPOSTagger(POS_MODEL_PATH, POS_JAR_PATH, java_options='')
tokenizer = StanfordTokenizer(SEGMENT_PATH)
load()

s = "@user nah pretty sure it's jackson's great jokes"
ws = tokenizer.tokenize(s)
print(' '.join(ws))

# print(' '.join(segment('#happythankgiving')))
# s = 'i got to to go formal with my best friend @ phi mu at jsu'.split()
# ner_sent = ner_tagger.tag(s)
# pos_sent = pos_tagger.tag(s)
# print(ner_sent)
# print(pos_sent)
java_path = "C:/Program Files/Java/jre1.8.0_131/bin/java.exe"
parser_path = "D:/stanford-parser-full-2016-10-31/stanford-parser.jar"
models_path = "D:/stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
engPCFG_path = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"

import os
os.environ['JAVA_HOME'] = java_path

import sys
from nltk.tokenize import StanfordTokenizer
tokenizer = StanfordTokenizer(parser_path)

from nltk.parse.stanford import StanfordDependencyParser
parser = StanfordDependencyParser(parser_path, models_path, engPCFG_path)

import nltk
from nltk.tree import Tree
from nltk.corpus.reader.wordnet import Synset
from nltk.corpus import semcor
from nltk.corpus import wordnet
from nltk.wsd import lesk

noun = set(['NN', 'NNS', 'NNP', 'NNPS'])
verb = set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
adjective = set(['JJ', 'JJR', 'JJS'])
adverb = set(['RB', 'RBR', 'RBS'])
substantive = noun | verb | adjective | adverb
            else:
                result.append([k1, k2])
                i = i + 2
        else:
            i = i + 1
    return result


if __name__ == '__main__':  # very important
    # res = request([["excellent"], ["poor"]])
    poshit = 1510000000032
    neghit = 771000000037
    print(poshit)
    print(neghit)
    stopword = ["-LSB-", "-RSB-", "-LRB-", "-RRB-"]
    tokenizer = StanfordTokenizer()
    filename = "F:/course/sentimentcode/rt-polarity.neg"
    file_object = codecs.open(filename, 'r', 'utf-8')
    allres = []
    try:
        all_the_text = file_object.read()
        arr = tokenizer.tokenize(all_the_text)
        la = len(arr)
        correct = 0
        for line in arr:
            ax = line.split()
            wordarr = []
            for word in ax:
                if word in stopword:
                    continue
                wordarr.append(word)