class tokenizer:

    def __init__(self):
        self.stanford_tokenizer = StanfordTokenizer(
            '../stanford-parser-2010-08-20/stanford-parser.jar',
            options={"americanize": False})

    # tokenize with stanford_parser
    def stanford_tokenize(self, row):
        temp_list = self.stanford_tokenizer.tokenize(row)
        return temp_list

    # tokenize with nltk word_tokenizer
    def word_tokenize(self, row):
        temp_list = nltk.word_tokenize(row)
        list_length = len(temp_list)
        index_list = list()
        for i in xrange(list_length):
            if temp_list[i].startswith('\''):
                if len(temp_list[i]) > 3:
                    temp_list[i] = temp_list[i][1:]
                    index_list.append(i)
        # end for
        count = 0
        for index in index_list:
            temp_list.insert(index + count, '\'')
            count += 1
        # end for
        return temp_list

    def no_block(self, string):
        string = re.sub(r' ', '', string)
        return len(string)
def __init__(self, sentence):
    en_parser = StanfordParser(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar',
        path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
        model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
    )
    sg = StanfordTokenizer(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
    )
    self.status = 0
    self.trans = googletrans.Translator()
    self.sentence = sentence.strip("\n").replace(" ", "")
    en_trans = self.trans.translate(sentence).text
    en_trans = sg.tokenize(en_trans)
    try:
        tree = list(en_parser.parse(en_trans))
        self.tree = tree[0]
        # print(self.tree)
        self.rel = []
    except:
        self.status = 1
import re

import nltk
from nltk.tokenize.stanford import StanfordTokenizer


class tokenizer:

    def __init__(self):
        self.stanford_tokenizer = StanfordTokenizer(
            '../stanford-parser-2010-08-20/stanford-parser.jar',
            options={"americanize": False})

    # tokenize with stanford_parser
    def stanford_tokenize(self, row):
        temp_list = self.stanford_tokenizer.tokenize(row)
        return temp_list

    # tokenize with nltk word_tokenizer
    def word_tokenize(self, row):
        temp_list = nltk.word_tokenize(row)
        list_length = len(temp_list)
        index_list = list()
        for i in xrange(list_length):
            if temp_list[i].startswith('\''):
                if len(temp_list[i]) > 3:
                    temp_list[i] = temp_list[i][1:]
                    index_list.append(i)
        # end for
        count = 0
        for index in index_list:
            temp_list.insert(index + count, '\'')
            count += 1
        # end for
        return temp_list

    def no_block(self, string):
        string = re.sub(r' ', '', string)
        return len(string)
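# A minimal usage sketch for the wrapper above (added for illustration). It assumes
# Java is installed and the Stanford parser jar exists at the relative path
# hard-coded in __init__; the sample sentence is arbitrary.
if __name__ == '__main__':
    tk = tokenizer()
    sample = "I can't believe it 'worked' so well."
    print(tk.stanford_tokenize(sample))   # Stanford PTB-style tokens
    print(tk.word_tokenize(sample))       # NLTK tokens with leading apostrophes re-split
    print(tk.no_block("a b  c"))          # character count with spaces removed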
def clean_content(page, verbose=True):
    content = page.content
    tag_name = ''
    ret_content = []
    for line in content.splitlines():
        match = re.match('=+ +(.+)? +=+', line)
        # If the line is not a section tag, keep it (unless we are inside an omitted section)
        if not match:
            if len(line) and not tag_name in clean_content.omit_sections:
                ret_content.append(line)
            continue
        # Update the current tag name
        tag_name = match.group(1)
        match = re.match('(.+)?Edit', tag_name)
        if match:
            tag_name = match.group(1)

    # Keep only sentences that start with a capital letter and end with ., ! or ?
    st = StanfordTokenizer()
    ret_tokens = []
    for idx, line in enumerate(ret_content):
        if verbose:
            sys.stdout.write('\rParsing "%s" %d / %d' % (page.title, idx + 1, len(ret_content)))
            sys.stdout.flush()
        tokens = st.tokenize(line)
        indices = [0] + [i + 1 for i, e in enumerate(tokens) if e in ['.', '!', '?']]
        subtokens = [tokens[indices[i]:indices[i + 1]] for i in range(len(indices) - 1)]
        ret_tokens.extend(filter(lambda tokens: tokens[0][0].isupper(), subtokens))
    if verbose:
        sys.stdout.write('\n')
        sys.stdout.flush()
    return '\n'.join([' '.join(line) for line in ret_tokens])
import os

from nltk.tokenize.stanford import StanfordTokenizer


def tokenize(content):
    """Breaks up text-based content into tokens in the style of the PTB corpus"""
    _path_to_jar = os.path.abspath(
        'summarize/stanford-postagger/stanford-postagger.jar')
    token_list = []
    st = StanfordTokenizer(path_to_jar=_path_to_jar)
    content = content.lower()
    token_list = st.tokenize(content)
    return token_list
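# A minimal usage sketch (added for illustration); it assumes Java is installed
# and the postagger jar exists at the relative path used above.
if __name__ == '__main__':
    print(tokenize("Dr. Smith paid $4.50 for coffee, then left."))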
import os
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize.stanford import StanfordTokenizer


def tokenize_stopwords_stemmer(texts):
    # Use this block for Stanford tokenization; it is not needed for the plain tokenizer
    # tokenize
    Str_texts = texts[0]
    # tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")
    # path_to_jar locates the jar file; the r prefix keeps backslashes such as '\t'
    # in the path from being interpreted as escapes
    java_path = 'C:/Program Files/Java/jdk1.8.0_121/bin/java.exe'
    os.environ['JAVAHOME'] = java_path
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a single string
    # print(texts_tokenized)

    p1 = r'[-@<#$%^&*].+'
    # re.compile() turns the pattern string into a Pattern object used to match the text
    pa1 = re.compile(p1)
    texts_filtered0 = [
        document for document in texts_tokenized
        if not document in pa1.findall(document)
    ]

    p2 = r'.+[-_\/].+'
    # changed from r'.+[-_\./].+' so periods between digits are kept, e.g. 3.1.2
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            # split() cuts the string on the given separator and returns a list of parts
            if document.find('_') > -1:
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
            elif document.find('/') > -1:
                texts_filtered = texts_filtered + document.split('/')
        else:
            texts_filtered.append(document)
    # drop empty tokens, quote marks and --
    texts_filtered = [
        document for document in texts_filtered
        if document != '' and document != "''" and document != "``"
    ]

    # stopwords
    english_stopwords = stopwords.words('english')  # stop word list
    texts_filtered_stopwords = [
        document for document in texts_filtered
        if not document in english_stopwords
    ]  # remove stop words

    # punctuation list
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!',
                            '*', '@', '#', '$', '%', '\n', '<', '>', '/', '\"', '\'',
                            '{', '}', '!', '~', '`', '$', '^', '/*', '*/', '/**',
                            '**/', '**', '-', '_', '+', '=', r'-?-', r'@?']
    texts_filtered = [
        document for document in texts_filtered_stopwords
        if not document in english_punctuations
    ]  # remove punctuation
    # print texts_filtered

    # remove tokens that contain 'comment'
    temp = texts_filtered[:]
    for i in temp:
        if 'comment' in i:
            texts_filtered.remove(i)
    # print(texts_filtered)
    # texts_filtered = [re.sub(r'^[1-9]\d*$'.format(punctuation), '', x) for x in texts_filtered]
    # ^[1-9]\d*$ would filter out plain integers

    porter = nltk.PorterStemmer()  # stemming algorithm
    texts_Stemmered = [porter.stem(t) for t in texts_filtered]  # stem each token
    return texts_Stemmered  # return a list of stems
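# A minimal usage sketch (added for illustration). The function expects a list
# whose first element is the raw string; it assumes stanford-parser.jar sits in
# the working directory and that the JDK path set inside the function exists.
if __name__ == '__main__':
    raw = ["The quick_brown fox, version 3.1.2, jumped over /* the */ lazy dog."]
    print(tokenize_stopwords_stemmer(raw))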
def tokenize_q(qa, phase):
    qas = len(qa)
    MyTokenizer = StanfordTokenizer()
    for i, row in enumerate(qa):
        row['question_toked'] = MyTokenizer.tokenize(row['question'].lower())[:14]
        if i % 50000 == 0:
            json.dump(qa, open('vqa_' + phase + '_toked_' + str(i) + '.json', 'w'))
        if i == qas - 1:
            json.dump(qa, open('vqa_' + phase + '_toked.json', 'w'))
from nltk.tokenize.stanford import StanfordTokenizer


class Tokenizer(object):
    """ Tokenize a sentence with the Stanford tokenizer """

    def __init__(self, jar_path):
        self.tokenizer = StanfordTokenizer(jar_path)

    def tokenize(self, sentence):
        return self.tokenizer.tokenize(sentence)

    def __call__(self, sentence):
        return self.tokenize(sentence)
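# A minimal usage sketch (added for illustration); the jar path is a placeholder
# for a locally installed Stanford tokenizer/POS-tagger jar.
if __name__ == '__main__':
    tok = Tokenizer('/path/to/stanford-postagger.jar')
    print(tok("It's a test sentence, isn't it?"))  # __call__ delegates to tokenize()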
from nltk.tokenize.stanford import StanfordTokenizer


def spans(txt):
    english_tokenizer = StanfordTokenizer(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar',
        options={"americanize": True},
        java_options='-mx1000m')
    tokens = english_tokenizer.tokenize(txt)
    offset = 0
    for token in tokens:
        offset = txt.find(token, offset)
        yield token, offset, offset + len(token)
        offset += len(token)
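# A minimal usage sketch (added for illustration); the jar path inside spans()
# is machine-specific. Note that tokens the tokenizer rewrites (PTB bracket
# escapes, normalized quotes, americanized spellings) may not be found verbatim
# in the original text, in which case the reported offsets can be wrong.
if __name__ == '__main__':
    for token, start, end in spans("Stanford tokenizers can report offsets too."):
        print("%s %d %d" % (token, start, end))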
class Preprocessor:

    def __init__(self):
        self.dm_single_close_quote = u'\u2019'  # unicode
        self.dm_double_close_quote = u'\u201d'
        # acceptable ways to end a sentence
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"',
            self.dm_single_close_quote, self.dm_double_close_quote, ")"
        ]
        # We use these to separate the summary sentences in the .bin datafiles
        self.SENTENCE_START = '<s>'
        self.SENTENCE_END = '</s>'
        self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                           options={"tokenizeNLs": True})

    def tokenize(self, article):
        return self.tokenizer.tokenize(article)

    def fix_missing_period(self, line):
        """Adds a period to a line that is missing one"""
        if line == "":
            return line
        if line[-1] in self.END_TOKENS:
            return line
        # print line[-1]
        return line + " ."

    def adjust_article(self, article):
        # Lowercase everything
        lines = [line.lower() for line in article]
        # Put periods on the ends of lines that are missing them (this is a problem
        # in the dataset because many image captions don't end in periods;
        # consequently they end up in the body of the article as run-on sentences)
        lines = [self.fix_missing_period(line) for line in lines]
        # Separate out article sentences, skipping empty lines
        article_lines = []
        for idx, line in enumerate(lines):
            if line == "":
                continue  # empty line
            else:
                article_lines.append(line)
        # Make the article into a single string
        article = ' '.join(article_lines)
        # # Make abstract into a single string, putting <s> and </s> tags around the sentences
        # abstract = ' '.join(["%s %s %s" % (self.SENTENCE_START, sent, self.SENTENCE_END) for sent in highlights])
        return article
from nltk.tokenize.stanford import StanfordTokenizer


def stanfordTokenizer(rawText):
    """
    Uses Stanford University's natural language processing lab tokenizer
    to split raw text.
    """
    jarPath = "/Users/Nathan/nltk_data/stanford-postagger.jar"
    stanfordOptions = {"americanize": True, "ptb3Escaping": False}
    stanfordTokenizer = StanfordTokenizer(jarPath, 'UTF-8', stanfordOptions)
    return stanfordTokenizer.tokenize(rawText)
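# A minimal usage sketch (added for illustration); jarPath above is machine-specific
# and must point at a locally installed stanford-postagger.jar.
if __name__ == '__main__':
    print(stanfordTokenizer("Mr. Brown bought 3.5 kg of apples."))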
def __init__(self, sentence):
    en_parser = StanfordParser(
        path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar',
        path_to_models_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar',
    )
    sg = StanfordTokenizer(
        path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar')
    self.trans = googletrans.Translator()
    self.sentence = sentence
    result1 = sg.tokenize(self.trans.translate(sentence).text)
    tree = list(en_parser.parse(result1))
    self.tree = tree[0]
    self.rel = []
from nltk.tokenize.stanford import StanfordTokenizer


class Preprocessor:

    def __init__(self):
        self.dm_single_close_quote = u'\u2019'  # unicode
        self.dm_double_close_quote = u'\u201d'
        # acceptable ways to end a sentence
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"',
            self.dm_single_close_quote, self.dm_double_close_quote, ")"
        ]
        self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                           options={"tokenizeNLs": True})

    def tokenize(self, text):
        return self.tokenizer.tokenize(text)

    def fix_missing_period(self, line):
        """Adds a period to a line that is missing one"""
        if line == "":
            return line
        if line[-1] in self.END_TOKENS:
            return line
        return line + " ."

    def preprocess_text(self, text):
        """Preprocesses and prepares input text for summarization"""
        # Lowercase everything
        lines = [line.lower() for line in text]
        lines = [self.fix_missing_period(line) for line in lines]
        # Drop empty lines
        text_lines = []
        for idx, line in enumerate(lines):
            if line == "":
                continue  # empty line
            else:
                text_lines.append(line)
        # Make the text into a single string
        text = ' '.join(text_lines)
        return text
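# A minimal usage sketch (added for illustration): preprocess_text expects an
# iterable of lines. It assumes stanford-postagger.jar is in the working directory.
if __name__ == '__main__':
    pre = Preprocessor()
    lines = ["An image caption without punctuation", "A second, complete line."]
    cleaned = pre.preprocess_text(lines)
    print(cleaned)                # lowercased, with a period appended to the first line
    print(pre.tokenize(cleaned))  # Stanford tokenizer output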
class tokenizer(object):
    MY_ID = 'TOKENIZER'

    def __init__(self, mode=None):
        self.config = GetConfig()
        if mode:
            self.mode = mode
        else:
            if self.config.has_option(self.MY_ID, 'mode'):
                self.mode = self.config.get(self.MY_ID, 'mode')
            else:
                self.mode = 'NLTK'
        if self.mode == 'STANFORD':
            from nltk.tokenize.stanford import StanfordTokenizer as Tokenizer
            self.tokenizer = Tokenizer()
        elif self.mode == 'NLTK':
            pass
        elif self.mode == 'MINE':
            self.spacePunct = re.compile(ur'[`~!@#\$%\^&\*\(\)\[\]{}_\+\-=\|\\:;\"\'<>,\?/]')
            self.removePunct = re.compile(ur'\.')
        else:
            raise Exception('Error: tokenizer, Unknown mode %s!' % (self.mode))

    def tokenize(self, sent):
        if sent.endswith('-') or sent.endswith('~'):
            sent += ' '
        sent = sent.replace('~ ', ' ~ ')
        sent = sent.replace('- ', ' - ')
        if self.mode == 'STANFORD':
            tokens = self.tokenizer.tokenize(sent.strip())
        elif self.mode == 'NLTK':
            tokens = nltk.word_tokenize(sent.strip())
        elif self.mode == 'MINE':
            new_sent = sent.strip()
            new_sent = self.spacePunct.sub(' ', new_sent)
            new_sent = self.removePunct.sub('', new_sent)
            tokens = new_sent.split()
        p_sent = ' '.join(tokens)
        p_sent = p_sent.replace('% ', '%')
        p_sent = p_sent.replace('``', '\"')
        p_sent = p_sent.replace('\'\'', '\"')
        p_tokens = p_sent.split(' ')
        return p_tokens
def Tok_handler(self, sentence, parser):
    if parser == "spacy":
        try:
            import spacy, en_core_web_sm
        except ImportError:
            print("Can't import spacy")
        nlp = en_core_web_sm.load()
        doc = nlp(sentence)
        return [str(token) for token in doc]
    elif parser == "nltk":
        try:
            import nltk
            from nltk.tokenize.stanford import StanfordTokenizer
            os.environ["CLASSPATH"] = "./StanfordNLP/jars"
            os.environ["STANFORD_MODELS"] = "./StanfordNLP/models"
        except ImportError:
            print("Can't import nltk")
        tokenizer = StanfordTokenizer()
        return tokenizer.tokenize(sentence)
def __init__(self, sentence):
    en_parser = StanfordParser(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar',
        path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
        model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
    )
    sg = StanfordTokenizer(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar')
    self.trans = googletrans.Translator()
    self.sentence = sentence
    # googletrans returns a Translated object; the translated string is in .text
    result1 = sg.tokenize(self.trans.translate(sentence).text)
    tree = list(en_parser.parse(result1))
    self.tree = tree[0]
    self.rel = []
import codecs

import nltk
from nltk.tokenize.stanford import StanfordTokenizer


class FanFictionHPSpellTokenizer(object):

    TOKENIZE_BY_SENTENCE = "sentence"
    TOKENIZE_BY_PARAGRAPH = "paragraph"
    TOKENIZE_AS_TEXT = "text"
    DUMMY_SEPARATOR = "DUMMY_SEPARATOR"

    def __init__(self,
                 singleword_spells,
                 multiword_spells,
                 tokenize_by="text",  # tokenize_by="sentence",
                 punkt_tokenizer='tokenizers/punkt/english.pickle',
                 path_stanford_jar="/home/david/Descargas/stanford-corenlp-3.8.0.jar"):
        self.singleword_spells = singleword_spells
        self.multiword_spells = multiword_spells
        self.multiword_spells_joint = ["_".join(s.split()) for s in multiword_spells]
        self.tokenize_by = tokenize_by
        self.toktok = StanfordTokenizer(path_to_jar=path_stanford_jar)
        self.sent_detector = nltk.data.load(punkt_tokenizer)

    def tokenize(self, path):
        """Tokenizing one text at a time is slow"""
        with codecs.open(path, encoding="utf-8") as f_fanfiction:
            fanfiction_story = f_fanfiction.read().lower()
        if self.tokenize_by == self.TOKENIZE_BY_SENTENCE:
            return self._tokenize_by_sentence(fanfiction_story)
        elif self.tokenize_by == self.TOKENIZE_BY_PARAGRAPH:
            return self._tokenize_by_paragraph(fanfiction_story)
        elif self.tokenize_by == self.TOKENIZE_AS_TEXT:
            return self._tokenize(fanfiction_story)
        else:
            raise NotImplementedError

    def _tokenize_by_sentence(self, text):
        sentences = self.sent_detector.tokenize(text.strip())
        # join multiword spells with underscores so they survive tokenization
        for i, s in enumerate(sentences):
            for mws in self.multiword_spells:
                if mws in s:
                    s = s.replace(mws, "_".join(mws.split(" ")))
            sentences[i] = s
        # join with a dummy separator so sentence boundaries survive tokenization
        joined_sentences = (" " + self.DUMMY_SEPARATOR + " ").join(sentences)
        new_sentences = " ".join(self.toktok.tokenize(joined_sentences))
        return [s.split(" ") for s in new_sentences.split(self.DUMMY_SEPARATOR)]

    def _tokenize(self, text):
        output = []
        sentences = " ".join(self.sent_detector.tokenize(text.strip()))
        for mws in self.multiword_spells:
            if mws in sentences:
                sentences = sentences.replace(mws, "_".join(mws.split(" ")))
        tokens = self.toktok.tokenize(sentences)
        output.append(tokens)
        return output

    def is_spell(self, token):
        """Tokens must have been obtained by processing the text with tokenize()"""
        return token in self.multiword_spells_joint or token in self.singleword_spells
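# A minimal usage sketch (added for illustration). The spell lists are toy values,
# path_stanford_jar must point at a local CoreNLP jar, and 'story.txt' is a
# placeholder path to a fan-fiction text file.
if __name__ == '__main__':
    hp = FanFictionHPSpellTokenizer(
        singleword_spells=["accio", "lumos"],
        multiword_spells=["expecto patronum", "wingardium leviosa"],
        tokenize_by="text",
        path_stanford_jar="/path/to/stanford-corenlp-3.8.0.jar")
    tokens = hp.tokenize("story.txt")[0]
    print([t for t in tokens if hp.is_spell(t)])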
import os

from pkg_resources import resource_filename
from nltk.tag import StanfordNERTagger
from nltk.tokenize.stanford import StanfordTokenizer


class SMOCoder:

    def __init__(self):
        # set environment variable
        # TO DO: update to Docker path
        os.environ['CLASSPATH'] = resource_filename(__name__, 'tokenizers/')
        # load tokenizer and tagger
        # TO DO: again, update to Docker path
        self.STANFORD_TOKENIZER = StanfordTokenizer(
            resource_filename(__name__, 'tokenizers/stanford-ner-3.6.0.jar'))
        self.SMO_tagger = StanfordNERTagger(
            resource_filename(__name__, 'classifiers/ner-orgs_2016-03-28_all.ser.gz'))

    def getSMO(self, text, as_str=False):
        '''
        Extract social movement organizations (SMOs) from text using a custom-trained
        Stanford NER tagger.

        :param text: text to extract social movement organizations from
        :type text: string
        :param as_str: logical indicating whether SMOs should be returned as a string.
            Defaults to False.
        :type as_str: boolean
        :return: SMOs extracted from text
        :rtype: set, or a string if as_str = True
        '''
        # Tokenize. What to do about <br /> ?
        tokens = self.STANFORD_TOKENIZER.tokenize(text)

        # Run tagging. This returns a list of tuples,
        # classified as 'ORGANIZATION' if an SMO, 'O' otherwise
        tags = self.SMO_tagger.tag(tokens)

        current_SMO = ''
        all_SMOs = []

        # Note: the Stanford NER tagger tags individual words as SMO or non-SMO.
        # For example, Black Lives Matter is returned as ('Black', 'ORGANIZATION'),
        # ('Lives', 'ORGANIZATION'), ('Matter', 'ORGANIZATION').
        # We want to parse this into a single organization.
        #
        # Non-perfect solution: assume that all consecutive ORGANIZATION tags
        # represent a single SMO
        for tag in tags:
            # if tagged as organization, add to current SMO and skip ahead
            if 'ORGANIZATION' == tag[1]:
                if '' == current_SMO or "'" == tag[0]:
                    current_SMO = current_SMO + tag[0]
                else:
                    current_SMO = current_SMO + ' ' + tag[0]
                continue
            # warn about unknown labels
            if 'O' != tag[1]:
                print('Unknown tag ' + tag[1] +
                      ', skipping ahead. Could be worth investigating further.')
            # add last detected organization to the list and reset current_SMO
            if '' != current_SMO:
                all_SMOs.append(current_SMO)
                current_SMO = ''

        # get unique elements
        all_SMOs = set(all_SMOs)
        if as_str:
            return '; '.join(all_SMOs)
        else:
            return all_SMOs
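# A minimal usage sketch (added for illustration). SMOCoder expects the NER jar
# and the custom-trained model to be packaged under tokenizers/ and classifiers/
# next to this module; without those resources the constructor will fail.
if __name__ == '__main__':
    coder = SMOCoder()
    text = "Members of Black Lives Matter and the Sierra Club marched downtown."
    print(coder.getSMO(text, as_str=True))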
with open('pairs_ronny{}.csv'.format(teststr), 'r+') as f:
    lines_pairs = f.readlines()[1:]

with open('table_ronny.csv', 'r') as f:
    lines_table = f.readlines()

# parse table
entry_table = parse_table(lines_table, stanford)
with open('dummy_tok.tables.jsonl', 'w') as f:
    json.dump(entry_table, f)

# parse all pairs
f = open('dummy_tok{}.jsonl'.format(teststr), 'w+')
for line in lines_pairs:
    entry = dict(phase=2)
    # line.replace('"', '')
    # line = line.encode()
    line = line.split(',')
    sqlquery = line[0]
    question = line[1]
    entry['query'] = sqlquery
    entry['question'] = question
    entry['sql'] = parse_sql(sqlquery, entry_table['header'])
    entry['query_tok'] = stanford.tokenize(sqlquery)
    entry['question_tok'] = stanford.tokenize(question)
    entry['table_id'] = 'mock_time_machine'
    json.dump(entry, f)
    f.write('\n')
f.close()
os.system('cp dummy_tok{}.jsonl ../data/'.format(teststr))
CLASSPATH = CLASSPATH + ':' + path_api
print(CLASSPATH)
os.environ["CLASSPATH"] = CLASSPATH
os.environ['STANFORD_PARSER'] = path_corenlp
os.environ['STANFORD_MODELS'] = path_model

sent = "Kalla, it\'s a dog!"

from nltk.tokenize.stanford import StanfordTokenizer
tokenizer = StanfordTokenizer()
print(tokenizer.tokenize(sent))

from nltk.parse.stanford import StanfordParser


class MyParser(StanfordParser):

    def raw_parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences.
        Takes multiple sentences as a list of strings. Each sentence will be
        automatically tokenized and tagged by the Stanford Parser.
        The output format is `wordsAndTags`.

        :param sentences: Input sentences to parse
        :type sentences: list(str)
class Values(object):

    def __init__(self, data, path, all_names):
        self.data = data
        self.path = path
        self.english_postagger = StanfordPOSTagger(
            path + 'models/english-left3words-distsim.tagger',
            path + 'lib/stanford-postagger-3.4.1.jar',
            java_options='-Xmx2g')
        self.english_tokenizer = StanfordTokenizer(
            path + 'lib/stanford-postagger-3.4.1.jar', 'utf-8')
        self.all_names = all_names
        self.pos = self.extract_POS()
        self.nms = self.extract_names()
        self.wg1 = self.extract_wordgrams(1)
        self.wg2 = self.extract_wordgrams(2)
        self.cg1 = self.extract_chargrams(1)
        self.cg2 = self.extract_chargrams(2)
        self.cg3 = self.extract_chargrams(3)
        self.bl = self.extract_breaklines()
        self.ws = self.extract_websites()

    def getVals(self):
        return (self.bl, self.wg1, self.wg2, self.ws, self.nms, self.pos,
                self.cg1, self.cg2, self.cg3)

    def extract_POS(self):
        return self.english_postagger.tag(
            self.english_tokenizer.tokenize(self.data))

    def extract_websites(self):
        websites = []
        result = re.findall('href=\"(.*?)\"', self.data)
        for r in result:
            if (r == 'mailto:') or (r == 'http:///'):
                continue
            else:
                websites.append(r)
        return websites

    def extract_breaklines(self):
        breaklines = []
        idx_old = 0
        idx_new = self.data.find('<br>')
        breaklines.append(idx_new - idx_old)
        idx_old = idx_new
        while idx_old < len(self.data):
            idx_new = self.data.find('<br>', idx_old + 4)
            if (idx_new == -1):
                break
            breaklines.append(idx_new - idx_old)
            idx_old = idx_new
        return breaklines

    def extract_chargrams(self, gram_size):
        return [
            ''.join(self.data[i:i + gram_size])
            for i in range(len(self.data) - gram_size + 1)
        ]

    def extract_wordgrams(self, gram_size):
        r = re.compile(r'[\s{}\t\n\r\+\>\<\=\¢\â\$]+'.format(re.escape(punctuation)))
        word_list = r.split(self.data)
        #word_list = re.split('\W+', self.data)
        #word_list = re.split(r'[\p{P} \\t\\n\\r\\+\\>\\<\\=\\¢\\â\\$]+', self.data)
        word_list = filter(None, word_list)
        return [
            ''.join(word_list[i:i + gram_size])
            for i in range(len(word_list) - gram_size + 1)
        ]

    def extract_names(self):
        r = re.compile(r'[\s{}\t\n\r\+\>\<\=\¢\â\$]+'.format(re.escape(punctuation)))
        word_list = r.split(self.data)
        #word_list = re.split('\W+', self.data)
        #word_list = re.split('[\p{P} \\t\\n\\r\\+\\>\\<\\=\\¢\\â\\$]+', self.data)
        word_list = filter(None, word_list)
        word_list = [x.lower() for x in word_list]
        return list(set(word_list) & set(self.all_names))
    strip_handles=True)
tokenized_datasets_original_tweet = [[
    tweet_tokenizer.tokenize(request) for request in dataset
] for dataset in datasets]

print("Retokenizing with Stanford tokenizer. This may take a long time.")
path_pos = "/playpen/home/tongn/stanford-postagger-full-2017-06-09/"
jar_pos = "stanford-postagger.jar"
tokenizer = StanfordTokenizer(path_pos + jar_pos)
tokenizer = StanfordTokenizer(tagger_path)
tokenized_datasets_original = [[
    tokenizer.tokenize(' '.join(request).strip()) for request in dataset
] for dataset in tokenized_datasets_original_tweet]
# tokenized_datasets_original = tokenized_datasets_original_tweet

"""
Convert all tokens to lowercase
"""
tokenized_datasets = [[[token.lower() for token in request]
                       for request in dataset]
                      for dataset in tokenized_datasets_original]

"""
Build the whole vocabulary

Vocab lists:
• special token: "UNK_TOKEN"
• vocab_shared: intersection of word2vec vocab and politeness vocab
• vocab_freq: frequent vocab that is not in word2vec vocab
def main(in_path, outpath):
    nltk.download()
    span_extractor = torch.load(os.path.join(EXPERIMENT, 'best_span_extractor.tar'),
                                map_location='cpu')
    answer_verifier = torch.load(os.path.join(EXPERIMENT, 'best_answer_verifier.tar'),
                                 map_location='cpu')
    span_extractor.use_cuda = False
    answer_verifier.use_cuda = False

    # same tokenizer used by the lexical parser
    tokenizer = StanfordTokenizer(options={'ptb3Escaping': True})
    parser = StanfordParser(java_options='-mx5g')

    data = json.load(open(in_path, 'r'))['data']
    batches = []
    official_eval = {}
    official_eval_tokens = {}
    qaid_map = {}
    num_articles = len(data)
    for aidx in range(len(data)):
        article = data[aidx]
        print('\t- Article Count=%d/%d' % (aidx + 1, num_articles))
        for pidx, paragraph in enumerate(article['paragraphs']):
            passage, qas = paragraph['context'], paragraph['qas']
            passage = passage.replace(u'\xa0', ' ')
            sentences = sent_tokenize(passage)
            sentence_tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
            raw_trees = [
                list(s)[0]
                for s in list(parser.parse_sents(sentence_tokens, verbose=True))
            ]
            squad_tree = TreePassage(raw_trees)
            for qidx, qa in enumerate(qas):
                question_sentences = sent_tokenize(qa['question'])
                question_tokens = []
                for s in question_sentences:
                    question_tokens += tokenizer.tokenize(s)
                batches.append(Batch([{
                    'apid': 'apid',
                    'qa_id': qa['id'],
                    'context_squad_tree': squad_tree,
                    'question_tokens': question_tokens,
                    'answers': [],
                    'is_impossible': 0
                }], False))
                qaid_map[qa['id']] = paragraph['context']

    span_extractor.eval()
    answer_verifier.eval()
    for idx, batch in enumerate(batches):
        qa_id = batch.qa_id[0]
        node_scores, expected_f1s, global_answer_score = span_extractor(
            batch, eval_system=True)
        score_confidence, predicted_node_idxs = node_scores.max(dim=1)
        score_confidence, predicted_node_idxs = (
            variable_to_numpy(score_confidence, False),
            variable_to_numpy(predicted_node_idxs, False))
        # Answer score = predicted has-answer probability
        answer_score = answer_verifier(batch,
                                       predicted_node_idxs=predicted_node_idxs,
                                       eval_system=True)
        # convert from tensor to numpy array
        answer_proba = variable_to_numpy(Sigmoid()(answer_score), False)
        global_answer_proba = variable_to_numpy(Sigmoid()(global_answer_score), False)
        has_answer_proba = (0.3 * score_confidence + 0.4 * global_answer_proba +
                            0.3 * answer_proba)[0]
        predicted_span = batch.trees[0].span(predicted_node_idxs[0])
        predicted_has_answer = has_answer_proba >= HAS_ANSWER_THRESHOLD
        predicted_text = tokens_to_text(predicted_span, qaid_map[qa_id])
        official_eval[qa_id] = predicted_text if predicted_has_answer else ''
        official_eval_tokens[qa_id] = ' '.join(predicted_span) if predicted_has_answer else ''

    json.dump(official_eval, open(outpath, 'w'))
@author: BurakKerim
'''
from nltk.tokenize import wordpunct_tokenize
from nltk.tag import StanfordPOSTagger
from nltk.tokenize.stanford import StanfordTokenizer

# May need to export JAVA_HOME;
# use export in linux in a decent way
import os
java_path = '/usr/lib/jvm/jdk1.8.0_31'
# java_path = 'C:\Program Files\Java\jdk1.8.0_31'
os.environ['JAVAHOME'] = java_path

tagger_home = ('/media/burak/Data/Workspace/Library/'
               'stanford-postagger-full-2018-02-27/')

model = tagger_home + 'models/english-bidirectional-distsim.tagger'
jar_file = tagger_home + 'stanford-postagger.jar'

tagger = StanfordPOSTagger(model, path_to_jar=jar_file)
tokenizer = StanfordTokenizer(path_to_jar=jar_file)

sentence = 'This sentence is a test sentence for test in a test environment.'

print(tagger.tag(wordpunct_tokenize(sentence)))
print(tagger.tag(tokenizer.tokenize(sentence)))
def test_stanford_tokenizer():
    files = os.listdir("/Users/ruben/Desktop/txt/")
    standfor = StanfordTokenizer()
    total = sum(len(standfor.tokenize(readfile(DOCS_TXT_ROOT + f))) for f in files)
    print "\nStanfordTokenizer total " + str(total)
len(relations)

sentences = []
e1 = []
e2 = []
for j, line in enumerate(lines):
    text = []
    temp = []
    t = line.split("<e1>")
    text.append(t[0])
    temp.append(t[0])
    t = t[1].split("</e1>")
    e1_text = text
    e1_text = " ".join(e1_text)
    e1_text = tokenizer.tokenize(e1_text)
    text.append(t[0])
    e11 = t[0]
    y = tokenizer.tokenize(t[0])
    y[0] += "E11"
    temp.append(" ".join(y))
    t = t[1].split("<e2>")
    text.append(t[0])
    temp.append(t[0])
    t = t[1].split("</e2>")
    e22 = t[0]
    e2_text = text
    e2_text = " ".join(e2_text)
    e2_text = tokenizer.tokenize(e2_text)
    text.append(t[0])
    text.append(t[1])
tokenizer = StanfordTokenizer(path_to_jar=jar_file)

s1 = ("On a $50,000 mortgage of 30 years at 8 percent,"
      " the monthly payment would be $366.88.")
s2 = "\"We beat some pretty good teams to get here.\" Slocum said."
s3 = ("Well, we couldn't have this predictable, cliche-ridden, "
      "\"Touched by an Angel\" "
      "(a show creator John Masius worked on) wanna-be if she didn't.")
s4 = "I cannot work under these conditions!"
s5 = "The company spent $30,000,000 last year."

p = [s1, s2, s3, s4, s5]
par = ' '.join(p)

for s in p:
    print(word_tokenize(s))
    print(wordpunct_tokenize(s))
    print(tokenizer.tokenize(s))
    print()

for s in sent_tokenize(par):
    print(word_tokenize(s))
    print(wordpunct_tokenize(s))
    print(tokenizer.tokenize(s))
    print()
from nltk.tokenize.stanford import StanfordTokenizer
import os

os.environ['CLASSPATH'] = '/root/datasets/stanford-postagger-2018-10-16/'

with open('input_question.txt', 'r') as f:
    raw_q = f.read()

stanford = StanfordTokenizer()
print(stanford.tokenize(raw_q))