def __init__(self, sentence):
    en_parser = StanfordParser(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar',
        path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
        model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
    )
    sg = StanfordTokenizer(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
    )
    self.status = 0
    self.trans = googletrans.Translator()
    self.sentence = sentence.strip("\n").replace(" ", "")
    en_trans = self.trans.translate(sentence).text
    en_trans = sg.tokenize(en_trans)
    try:
        tree = list(en_parser.parse(en_trans))
        self.tree = tree[0]
        # print(self.tree)
        self.rel = []
    except:
        self.status = 1
def clean_content(page, verbose=True):
    content = page.content
    tag_name = ''
    ret_content = []
    for line in content.splitlines():
        match = re.match('=+ +(.+)? +=+', line)
        # Is this line a section tag? If not, append it to the list
        if not match:
            if len(line) and not tag_name in clean_content.omit_sections:
                ret_content.append(line)
            continue
        # Update the tag name
        tag_name = match.group(1)
        match = re.match('(.+)?Edit', tag_name)
        if match:
            tag_name = match.group(1)
    # Keep only sentences that start with a capital letter and end with
    # sentence-final punctuation
    st = StanfordTokenizer()
    ret_tokens = []
    for idx, line in enumerate(ret_content):
        if verbose:
            sys.stdout.write('\rParsing "%s" %d / %d' % (page.title, idx + 1, len(ret_content)))
            sys.stdout.flush()
        tokens = st.tokenize(line)
        indices = [0] + [i + 1 for i, e in enumerate(tokens) if e in ['.', '!', '?']]
        subtokens = [tokens[indices[i]:indices[i + 1]] for i in range(len(indices) - 1)]
        ret_tokens.extend(filter(lambda sub: sub[0][0].isupper(), subtokens))
    if verbose:
        sys.stdout.write('\n')
        sys.stdout.flush()
    return '\n'.join([' '.join(line) for line in ret_tokens])
def tokenize(content):
    """Breaks up text-based content into tokens in the style of the PTB corpus."""
    _path_to_jar = os.path.abspath(
        'summarize/stanford-postagger/stanford-postagger.jar')
    st = StanfordTokenizer(path_to_jar=_path_to_jar)
    content = content.lower()
    token_list = st.tokenize(content)
    return token_list
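# A minimal usage sketch of the helper above (illustrative input; assumes the
# stanford-postagger.jar path resolved by os.path.abspath actually exists):
tokens = tokenize("Mr. O'Neill thinks that the boys' stories are over-simplified.")
print(tokens)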
def __init__(self):
    self.dm_single_close_quote = u'\u2019'  # unicode
    self.dm_double_close_quote = u'\u201d'
    self.END_TOKENS = [
        '.', '!', '?', '...', "'", "`", '"',
        self.dm_single_close_quote, self.dm_double_close_quote, ")"
    ]  # acceptable ways to end a sentence
    self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                       options={"tokenizeNLs": True})
def tokenize_q(qa, phase):
    qas = len(qa)
    MyTokenizer = StanfordTokenizer()
    for i, row in enumerate(qa):
        row['question_toked'] = MyTokenizer.tokenize(row['question'].lower())[:14]
        if i % 50000 == 0:
            json.dump(qa, open('vqa_' + phase + '_toked_' + str(i) + '.json', 'w'))
        if i == qas - 1:
            json.dump(qa, open('vqa_' + phase + '_toked.json', 'w'))
def tokenize_stopwords_stemmer(texts):
    # Use this branch when tokenizing with Stanford's tokenizer; it is not
    # needed for ordinary tokenization.
    Str_texts = texts[0]
    # tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    # path_to_jar locates the jar. The r prefix keeps backslashes literal, so a
    # path containing '\t' is not turned into a tab character.
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")
    java_path = 'C:/Program Files/Java/jdk1.8.0_121/bin/java.exe'
    os.environ['JAVAHOME'] = java_path
    # The input must be a single string; tokenize it.
    texts_tokenized = tokenizer.tokenize(Str_texts)
    # print(texts_tokenized)
    # re.compile turns the pattern string into a Pattern instance, which is
    # then used to match the text (each match is a Match instance).
    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)
    texts_filtered0 = [document for document in texts_tokenized
                       if document not in pa1.findall(document)]
    # r'.+[-_\./].+' was changed to r'.+[-_\/].+' so that periods between
    # digits survive, e.g. version numbers like 3.1.2.
    p2 = r'.+[-_\/].+'
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            # split() cuts the string on the given separator and returns a list.
            if document.find('_') > -1:
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
            elif document.find('/') > -1:
                texts_filtered = texts_filtered + document.split('/')
        else:
            texts_filtered.append(document)
    # Filter out empty strings and stray quote tokens.
    texts_filtered = [document for document in texts_filtered
                      if document != '' and document != "''" and document != "``"]
    # Filter out English stopwords.
    english_stopwords = stopwords.words('english')
    texts_filtered_stopwords = [document for document in texts_filtered
                                if document not in english_stopwords]
    # Filter out punctuation tokens.
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!',
                            '*', '@', '#', '$', '%', '\n', '<', '>', '/', '\"',
                            '\'', '{', '}', '!', '~', '`', '$', '^', '/*', '*/',
                            '/**', '**/', '**', '-', '_', '+', '=', r'-?-', r'@?']
    texts_filtered = [document for document in texts_filtered_stopwords
                      if document not in english_punctuations]
    # print texts_filtered
    # Remove every element that contains the substring 'comment'.
    temp = texts_filtered[:]
    for i in temp:
        if 'comment' in i:
            texts_filtered.remove(i)
    # print(texts_filtered)
    # texts_filtered = [re.sub(r'^[1-9]\d*$', '', x) for x in texts_filtered]  # ^[1-9]\d*$ would drop bare integers
    porter = nltk.PorterStemmer()  # stemming algorithm
    texts_Stemmered = [porter.stem(t) for t in texts_filtered]
    return texts_Stemmered  # a list of stemmed tokens
def __init__(self):
    # Set environment variable
    # TODO: update to Docker path
    os.environ['CLASSPATH'] = resource_filename(__name__, 'tokenizers/')
    # Load tokenizer and tagger
    # TODO: again, update to Docker path
    self.STANFORD_TOKENIZER = StanfordTokenizer(
        resource_filename(__name__, 'tokenizers/stanford-ner-3.6.0.jar'))
    self.SMO_tagger = StanfordNERTagger(
        resource_filename(__name__, 'classifiers/ner-orgs_2016-03-28_all.ser.gz'))
def spans(txt):
    english_tokenizer = StanfordTokenizer(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar',
        options={"americanize": True},
        java_options='-mx1000m')
    tokens = english_tokenizer.tokenize(txt)
    offset = 0
    for token in tokens:
        offset = txt.find(token, offset)
        yield token, offset, offset + len(token)
        offset += len(token)
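# A minimal sketch of consuming the spans() generator above (illustrative
# input; assumes the hard-coded jar path exists). Note that str.find can
# misplace an offset whenever the tokenizer rewrites a token, e.g. via the
# "americanize" option:
for token, start, end in spans("Good muffins cost $3.88 in New York."):
    print(token, start, end)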
def __init__(self):
    self.dm_single_close_quote = u'\u2019'  # unicode
    self.dm_double_close_quote = u'\u201d'
    self.END_TOKENS = [
        '.', '!', '?', '...', "'", "`", '"',
        self.dm_single_close_quote, self.dm_double_close_quote, ")"
    ]  # acceptable ways to end a sentence
    # We use these to separate the summary sentences in the .bin datafiles
    self.SENTENCE_START = '<s>'
    self.SENTENCE_END = '</s>'
    self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                       options={"tokenizeNLs": True})
def __init__(self, sentence):
    en_parser = StanfordParser(
        path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar',
        path_to_models_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar',
    )
    sg = StanfordTokenizer(
        path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar')
    self.trans = googletrans.Translator()
    self.sentence = sentence
    result1 = sg.tokenize(self.trans.translate(sentence).text)
    tree = list(en_parser.parse(result1))
    self.tree = tree[0]
    self.rel = []
def stanfordTokenizer(rawText):
    """
    Uses Stanford University's natural language processing lab tokenizer
    to split raw text.
    """
    jarPath = "/Users/Nathan/nltk_data/stanford-postagger.jar"
    stanfordOptions = {"americanize": True, "ptb3Escaping": False}
    stanfordTokenizer = StanfordTokenizer(jarPath, 'UTF-8', stanfordOptions)
    return stanfordTokenizer.tokenize(rawText)
def get_sentence_embeddings(sentences, ngram='bigrams', model='concat_wiki_twitter'):
    """
    Returns a numpy matrix of embeddings for one of the published models.
    It handles tokenization and can be given raw sentences.

    Arguments:
        - ngram: 'unigrams' or 'bigrams'
        - model: 'wiki', 'twitter', or 'concat_wiki_twitter'
        - sentences: a list of raw sentences
          ['Once upon a time', 'This is another sentence.', ...]
    """
    wiki_embeddings = None
    twitter_embbedings = None
    tokenized_sentences_NLTK_tweets = None
    tokenized_sentences_SNLP = None
    if model == "wiki" or model == 'concat_wiki_twitter':
        tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
        s = ' <delimiter> '.join(sentences)  # just a trick to make things faster
        tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
        tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
        assert len(tokenized_sentences_SNLP) == len(sentences)
        wiki_embeddings = get_embeddings_for_preprocessed_sentences(
            tokenized_sentences_SNLP, MODEL_PG_3KBOOKS_BIGRAMS, FASTTEXT_EXEC_PATH)
    if model == "wiki":
        return wiki_embeddings
    elif model == "concat_wiki_twitter":
        # Note: twitter_embbedings is never populated in this variant, so this
        # branch would concatenate with None as written.
        return np.concatenate((wiki_embeddings, twitter_embbedings), axis=1)
    sys.exit(-1)
class tokenizer:
    def __init__(self):
        self.stanford_tokenizer = StanfordTokenizer(
            '../stanford-parser-2010-08-20/stanford-parser.jar',
            options={"americanize": False})

    # tokenize with stanford_parser
    def stanford_tokenize(self, row):
        temp_list = self.stanford_tokenizer.tokenize(row)
        return temp_list

    # tokenize with nltk word_tokenizer, re-splitting a leading apostrophe
    # off long tokens (Python 2 code: note xrange)
    def word_tokenize(self, row):
        temp_list = nltk.word_tokenize(row)
        list_length = len(temp_list)
        index_list = list()
        for i in xrange(list_length):
            if temp_list[i].startswith('\''):
                if len(temp_list[i]) > 3:
                    temp_list[i] = temp_list[i][1:]
                    index_list.append(i)
        # end for
        count = 0
        for index in index_list:
            temp_list.insert(index + count, '\'')
            count += 1
        # end for
        return temp_list

    def no_block(self, string):
        string = re.sub(r' ', '', string)
        return len(string)
def __init__(
        self,
        singleword_spells,
        multiword_spells,
        tokenize_by="text",  # or tokenize_by="sentence"
        punkt_tokenizer='tokenizers/punkt/english.pickle',
        path_stanford_jar="/home/david/Descargas/stanford-corenlp-3.8.0.jar"):
    self.singleword_spells = singleword_spells
    self.multiword_spells = multiword_spells
    self.multiword_spells_joint = ["_".join(s.split()) for s in multiword_spells]
    self.tokenize_by = tokenize_by
    self.toktok = StanfordTokenizer(path_to_jar=path_stanford_jar)
    self.sent_detector = nltk.data.load(punkt_tokenizer)
def stanfordNERInit():
    # On Windows the Java classpath separator is ';', not ':'.
    os.environ['CLASSPATH'] = ('C:/users/home/stanford-ner/stanford-ner-2017-06-09/stanford-ner.jar;'
                               'C:/users/home/stanford-ner/stanford-ner-2017-06-09/lib/*;'
                               'C:/users/home/stanford-postagger-full-2017-06-09/stanford-postagger.jar')
    os.environ['STANFORD_MODELS'] = 'C:/users/home/stanford-ner/stanford-ner-2017-06-09/classifiers/'
    sent_detection = nltk.data.load('tokenizers/punkt/english.pickle')
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    tokenizer = StanfordTokenizer()
    return sent_detection, st, tokenizer
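# A minimal sketch of wiring the three objects returned above together
# (illustrative sentence; assumes the jar and model paths set in the function
# exist on this machine):
sent_detection, st, tokenizer = stanfordNERInit()
for sentence in sent_detection.tokenize("Jane works at Acme Corp. She lives in Oslo."):
    print(st.tag(tokenizer.tokenize(sentence)))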
def __init__(self, data, path, all_names):
    self.data = data
    self.path = path
    self.english_postagger = StanfordPOSTagger(
        path + 'models/english-left3words-distsim.tagger',
        path + 'lib/stanford-postagger-3.4.1.jar',
        java_options='-Xmx2g')
    self.english_tokenizer = StanfordTokenizer(
        path + 'lib/stanford-postagger-3.4.1.jar', 'utf-8')
    self.all_names = all_names
    self.pos = self.extract_POS()
    self.nms = self.extract_names()
    self.wg1 = self.extract_wordgrams(1)
    self.wg2 = self.extract_wordgrams(2)
    self.cg1 = self.extract_chargrams(1)
    self.cg2 = self.extract_chargrams(2)
    self.cg3 = self.extract_chargrams(3)
    self.bl = self.extract_breaklines()
    self.ws = self.extract_websites()
def sentence_embeddings(wikiuni, snlpjar, fasttext, sentences,
                        ngram='unigrams', model='concat_wiki_twitter'):
    """
    Generate embeddings from a list of sentences.

    Parameters:
    -----------
    wikiuni: string
        Path to the Wikipedia embeddings
    snlpjar: string
        Path to the JAR file of the Stanford tagger
    fasttext: string
        Path to the executable of FastText
    sentences: list
        List containing raw sentences,
        e.g., ['Once upon a time', 'This is another sentence.', ...]
    ngram: string (unigrams|bigrams)
        ngram used in Wikipedia embeddings
    model: string (wiki|twitter|concat_wiki_twitter)
    """
    wiki_embeddings = None
    twitter_embbedings = None
    tokenized_sentences_NLTK_tweets = None
    tokenized_sentences_SNLP = None
    if model == "wiki" or model == 'concat_wiki_twitter':
        tknzr = StanfordTokenizer(snlpjar, encoding='utf-8')
        s = ' <delimiter> '.join(sentences)  # just a trick to make things faster
        tkn_sentences_SNLP = tokenize_sentences(tknzr, [s])
        tkn_sentences_SNLP = tkn_sentences_SNLP[0].split(' <delimiter> ')
        assert len(tkn_sentences_SNLP) == len(sentences)
        if ngram == 'unigrams':
            wiki_embeddings = sent2embeddings(tkn_sentences_SNLP, wikiuni, fasttext)
        # We are not using Twitter or bigrams so far:
        # else:
        #     wiki_embeddings = sent2embeddings(tkn_sentences_SNLP,
        #                                       MODEL_WIKI_BIGRAMS, FASTTEXT_EXEC_PATH)
    # if model == "twitter" or model == 'concat_wiki_twitter':
    #     tknzr = TweetTokenizer()
    #     tkn_sentences_NLTK_tweets = tokenize_sentences(tknzr, sentences)
    #     if ngram == 'unigrams':
    #         twitter_embbedings = sent2embeddings(tkn_sentences_NLTK_tweets,
    #                                              MODEL_TWITTER_UNIGRAMS, FASTTEXT_EXEC_PATH)
    #     else:
    #         twitter_embbedings = sent2embeddings(tkn_sentences_NLTK_tweets,
    #                                              MODEL_TWITTER_BIGRAMS, FASTTEXT_EXEC_PATH)
    if model == "wiki":
        return wiki_embeddings
    # elif model == "twitter":
    #     return twitter_embbedings
    # elif model == "concat_wiki_twitter":
    #     return np.concatenate((wiki_embeddings, twitter_embbedings), axis=1)
    sys.exit(-1)
def Tok_handler(self, sentence, parser):
    if parser == "spacy":
        try:
            import spacy, en_core_web_sm
        except ImportError:
            print("Can't import spacy")
        nlp = en_core_web_sm.load()
        doc = nlp(sentence)
        return [str(token) for token in doc]
    elif parser == "nltk":
        try:
            import nltk
            from nltk.tokenize.stanford import StanfordTokenizer
            os.environ["CLASSPATH"] = "./StanfordNLP/jars"
            os.environ["STANFORD_MODELS"] = "./StanfordNLP/models"
        except ImportError:
            print("Can't import nltk")
        tokenizer = StanfordTokenizer()
        return tokenizer.tokenize(sentence)
class Tokenizer(object):
    """Tokenize a sentence with the Stanford tokenizer."""

    def __init__(self, jar_path):
        self.tokenizer = StanfordTokenizer(jar_path)

    def tokenize(self, sentence):
        return self.tokenizer.tokenize(sentence)

    def __call__(self, sentence):
        return self.tokenize(sentence)
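# A minimal usage sketch of the wrapper above; the jar path is a placeholder
# for wherever stanford-postagger.jar lives on your machine:
tok = Tokenizer('/path/to/stanford-postagger.jar')
print(tok("Good muffins cost $3.88 in New York."))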
def __init__(self, sentence):
    en_parser = StanfordParser(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar',
        path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
        model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
    )
    sg = StanfordTokenizer(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
    )
    self.trans = googletrans.Translator()
    self.sentence = sentence
    # googletrans returns a Translated object whose string lives in .text
    result1 = sg.tokenize(self.trans.translate(sentence).text)
    tree = list(en_parser.parse(result1))
    self.tree = tree[0]
    self.rel = []
class Preprocessor:
    def __init__(self):
        self.dm_single_close_quote = u'\u2019'  # unicode
        self.dm_double_close_quote = u'\u201d'
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"',
            self.dm_single_close_quote, self.dm_double_close_quote, ")"
        ]  # acceptable ways to end a sentence
        # We use these to separate the summary sentences in the .bin datafiles
        self.SENTENCE_START = '<s>'
        self.SENTENCE_END = '</s>'
        self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                           options={"tokenizeNLs": True})

    def tokenize(self, article):
        return self.tokenizer.tokenize(article)

    def fix_missing_period(self, line):
        """Adds a period to a line that is missing one."""
        if line == "":
            return line
        if line[-1] in self.END_TOKENS:
            return line
        return line + " ."

    def adjust_article(self, article):
        """Takes the article as a list of lines and returns a single cleaned string."""
        # Lowercase everything
        lines = [line.lower() for line in article]
        # Put periods on the ends of lines that are missing them (this is a
        # problem in the dataset because many image captions don't end in
        # periods; consequently they end up in the body of the article as
        # run-on sentences)
        lines = [self.fix_missing_period(line) for line in lines]
        # Separate out article sentences, skipping empty lines
        article_lines = [line for line in lines if line != ""]
        # Make the article into a single string
        article = ' '.join(article_lines)
        # Make abstract into a single string, putting <s> and </s> tags around the sentences:
        # abstract = ' '.join(["%s %s %s" % (self.SENTENCE_START, sent, self.SENTENCE_END)
        #                      for sent in highlights])
        return article
def get_sentence_embeddings(sentences, train, d):
    """
    Returns a numpy matrix of embeddings for one of the published models.
    It handles tokenization and can be given raw sentences.

    Arguments:
        - sentences: a list of raw sentences
          ['Once upon a time', 'This is another sentence.', ...]
        - train, d: passed through to get_embeddings_for_preprocessed_sentences
    """
    tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
    s = ' <delimiter> '.join(sentences)  # just a trick to make things faster
    # Split the joined string back into one entry per input sentence; the
    # assertion below cannot hold otherwise.
    tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])[0].split(' <delimiter> ')
    assert len(tokenized_sentences_SNLP) == len(sentences)
    wiki_embeddings = get_embeddings_for_preprocessed_sentences(
        tokenized_sentences_SNLP, MODEL_WIKI_UNIGRAMS, FASTTEXT_EXEC_PATH, train, d)
    return wiki_embeddings
class Preprocessor:
    def __init__(self):
        self.dm_single_close_quote = u'\u2019'  # unicode
        self.dm_double_close_quote = u'\u201d'
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"',
            self.dm_single_close_quote, self.dm_double_close_quote, ")"
        ]  # acceptable ways to end a sentence
        self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                           options={"tokenizeNLs": True})

    def tokenize(self, text):
        return self.tokenizer.tokenize(text)

    def fix_missing_period(self, line):
        """Adds a period to a line that is missing one."""
        if line == "":
            return line
        if line[-1] in self.END_TOKENS:
            return line
        return line + " ."

    def preprocess_text(self, text):
        """Preprocesses and prepares input text for summarization."""
        # Lowercase everything
        lines = [line.lower() for line in text]
        lines = [self.fix_missing_period(line) for line in lines]
        # Separate out text, skipping empty lines
        text_lines = [line for line in lines if line != ""]
        # Make the text into a single string
        text = ' '.join(text_lines)
        return text
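# A minimal usage sketch of the Preprocessor above (illustrative lines;
# assumes stanford-postagger.jar sits in the working directory):
prep = Preprocessor()
text = prep.preprocess_text(["First caption without period", "A full sentence."])
print(text)  # "first caption without period . a full sentence."
print(prep.tokenize(text))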
def __init__(self, paths_json):
    set_environment_paths(paths_json)
    self.sentence_sequences = []
    self.valence_sequences = []
    self.sentence_trees = []
    self.valence_trees = []
    self.CompleteWordIndices = []
    self.model = ""
    self.models2run = []
    self.neg_scope_method = ""
    self.neg_res_method = ""
    self.sent_comp_method = ""
    valence_dict_path = paths_json["VALENCE_DICT"]
    with open(valence_dict_path) as json_file:
        self.VALENCE_DICT = json.loads(json_file.read())
    negtool_negscopes_path = paths_json["NEGTOOL_NEGSCOPE"]
    self.negtool_neg_scopes_file = open(negtool_negscopes_path, "r")
    self.negtool_neg_scopes_file_current_line = 0
    self.use_negtool = False
    meaning_spec_distribution_dict_path = paths_json["MEANING_SPEC_DISTRIBUTION_DICT_PATH"]
    with open(meaning_spec_distribution_dict_path) as json_file:
        self.distribution_dict = json.loads(json_file.read())
    # window-based negation scope
    self.window_size = 4
    self.review_id = 0
    self.sentence_id = 0  # for negtool purposes
    # constants
    self.contractions = ["n't", "'m", "'ll", "'d", "'s", "'ve", "'re"]
    # parser and tokenizer initialization
    # self.PARSER = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    self.TOKENIZER = StanfordTokenizer()
    # using a locally running CoreNLP server
    self.CORENLP = StanfordCoreNLP('http://localhost:9000')
class tokenizer(object):
    MY_ID = 'TOKENIZER'

    def __init__(self, mode=None):
        self.config = GetConfig()
        if mode:
            self.mode = mode
        elif self.config.has_option(self.MY_ID, 'mode'):
            self.mode = self.config.get(self.MY_ID, 'mode')
        else:
            self.mode = 'NLTK'
        if self.mode == 'STANFORD':
            from nltk.tokenize.stanford import StanfordTokenizer as Tokenizer
            self.tokenizer = Tokenizer()
        elif self.mode == 'NLTK':
            pass
        elif self.mode == 'MINE':
            self.spacePunct = re.compile(ur'[`~!@#\$%\^&\*\(\)\[\]{}_\+\-=\|\\:;\"\'<>,\?/]')
            self.removePunct = re.compile(ur'\.')
        else:
            raise Exception('Error: tokenizer, Unknown mode %s!' % (self.mode))

    def tokenize(self, sent):
        if sent.endswith('-') or sent.endswith('~'):
            sent += ' '
        sent = sent.replace('~ ', ' ~ ')
        sent = sent.replace('- ', ' - ')
        if self.mode == 'STANFORD':
            tokens = self.tokenizer.tokenize(sent.strip())
        elif self.mode == 'NLTK':
            tokens = nltk.word_tokenize(sent.strip())
        elif self.mode == 'MINE':
            new_sent = sent.strip()
            new_sent = self.spacePunct.sub(' ', new_sent)
            new_sent = self.removePunct.sub('', new_sent)
            tokens = new_sent.split()
        p_sent = ' '.join(tokens)
        p_sent = p_sent.replace('% ', '%')
        p_sent = p_sent.replace('``', '\"')
        p_sent = p_sent.replace('\'\'', '\"')
        p_tokens = p_sent.split(' ')
        return p_tokens
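# A minimal usage sketch of the mode-switching tokenizer above, using the
# 'NLTK' mode to avoid the Java dependency (assumes GetConfig is importable
# and nltk's punkt data is installed):
tk = tokenizer(mode='NLTK')
print(tk.tokenize("it's 50% off - today only"))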
def tokenize(corenlp, review, span=False):
    r_dict = corenlp._request('ssplit', review)
    tokens2 = StanfordTokenizer().tokenize(review)
    print(r_dict)
    print(tokens2)
    tokens = [token['word']
              for s in r_dict['sentences']
              for token in s['tokens']]
    sentences = []
    current_sentence = []
    for token in tokens:
        # sentence boundary: the token consists only of '!'/'?', or is a period
        if (not bool(re.compile(r'[^\!\?]').search(token)) or token == "."):
            current_sentence.append(token)
            sentences.append(current_sentence)
            current_sentence = []
        else:
            current_sentence.append(token)
    # return [" ".join(sentence[:-1]) + sentence[-1] for sentence in sentences]
    return sentences  # tokenized sentences
def stanford_tokenize(s):
    return StanfordTokenizer().tokenize(s)
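# When StanfordTokenizer() is built without path_to_jar, NLTK locates the jar
# via the CLASSPATH environment variable, so a sketch of calling the one-liner
# above might look like this (the jar location is an assumption):
import os
os.environ['CLASSPATH'] = '/path/to/stanford-postagger.jar'
print(stanford_tokenize("Hello, world."))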
gold = []
for each in lis:
    item = each.split("\t")
    gold.append(item[1])
    item_email = ' '.join(e for e in item[2:])
    item_email = item_email.replace('</br>', ' ').replace(':', ' ')  # .split()
    # print(item_email)
    emails.append(item_email)
    emails_len.append(len(item_email))

max_words_email = max(emails_len)
total_email = len(emails)
embedding_size = 700
all_mail_vec = []

tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
s = ' <delimiter> '.join(emails)
tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
emails = tokenized_sentences_SNLP[0].split(' <delimiter> ')
embs_email = model.embed_sentences(emails)
# each_mail_vec = np.zeros((max_words_email, embedding_size))
# print(vector.shape)

f.close()
f = open('emails_dataset/emailExplanations_Dec23.sorted.txt', 'r')
concepts = {'REMINDER': [], 'HUMOR': [], 'EVENT': [], 'EMPLOYEE': [],
            'MEETING': [], 'POLICY': [], 'CONTACT': []}
import emoji
from nltk.tokenize.stanford import StanfordTokenizer

s = "Good muffins :-X cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
tokens = StanfordTokenizer().tokenize(s)
print(tokens)

a = {'a', 'b', 'c'}
b = {'b', 'c', 'd'}
c = {'a', 'd', 'e'}
d = a | b | c  # set union
print(d)
reader_obj = csv.reader(csvfile)
for row in reader_obj:
    # row[0] -> id
    # row[1] -> title
    # row[2] -> content
    # row[3] -> tags
    # soup = BeautifulSoup(row[2])
    # if soup.code != None:
    #     codecount += 1
    title_list += " " + row[1]
    check = int(time.time() - start)
    if check >= 10:
        print count, time.time() - start, "seconds"
        start = time.time()
    if not count % 10000:
        word_list = StanfordTokenizer(
            path_to_jar="/Users/apple/Downloads//stanford-postagger-2015-01-29/stanford-postagger-2015-01-30/stanford-postagger.jar"
        ).tokenize(title_list)
        fout.write(' '.join(word_list).encode('utf-8') + "\n")
        title_list = ""
        print(count)
    # word_list = nltk.word_tokenize(row[1].encode('utf-8'))
    # if not codecount % 10000:
    #     print codecount
    count += 1
    # print(soup.get_text())
    # pdb.set_trace()

# Flush whatever titles remain after the loop.
word_list = StanfordTokenizer(
    path_to_jar="/Users/apple/Downloads//stanford-postagger-2015-01-29/stanford-postagger-2015-01-30/stanford-postagger.jar"
).tokenize(title_list)
fout.write(' '.join(word_list).encode('utf-8') + "\n")
title_list = ""  # title_list is a string everywhere else, so reset to "" rather than []
print(count)
print("Tokenizing all requests.") tweet_tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True, strip_handles=True) tokenized_datasets_original_tweet = [[ tweet_tokenizer.tokenize(request) for request in dataset ] for dataset in datasets] print("Retokenizing with Stanford tokenizer. This may take a long time.") path_pos = "/playpen/home/tongn/stanford-postagger-full-2017-06-09/" jar_pos = "stanford-postagger.jar" tokenizer = StanfordTokenizer(path_pos + jar_pos) tokenizer = StanfordTokenizer(tagger_path) # tokenized_datasets_original = [ # [tokenizer.tokenize(' '.join(request).strip()) # for request in dataset] # for dataset in tokenized_datasets_original_tweet] tokenized_datasets_original = tokenized_datasets_original_tweet """ Convert all tokens to lowercase """ tokenized_datasets = [[[token.lower() for token in request] for request in dataset] for dataset in tokenized_datasets_original] """ Build the whole vocabulary
def test_stanford_tokenizer():
    files = os.listdir("/Users/ruben/Desktop/txt/")
    stanford = StanfordTokenizer()
    total = sum(len(stanford.tokenize(readfile(DOCS_TXT_ROOT + f))) for f in files)
    print "\nStanfordTokenizer total " + str(total)
class Values(object):
    def __init__(self, data, path, all_names):
        self.data = data
        self.path = path
        self.english_postagger = StanfordPOSTagger(
            path + 'models/english-left3words-distsim.tagger',
            path + 'lib/stanford-postagger-3.4.1.jar',
            java_options='-Xmx2g')
        self.english_tokenizer = StanfordTokenizer(
            path + 'lib/stanford-postagger-3.4.1.jar', 'utf-8')
        self.all_names = all_names
        self.pos = self.extract_POS()
        self.nms = self.extract_names()
        self.wg1 = self.extract_wordgrams(1)
        self.wg2 = self.extract_wordgrams(2)
        self.cg1 = self.extract_chargrams(1)
        self.cg2 = self.extract_chargrams(2)
        self.cg3 = self.extract_chargrams(3)
        self.bl = self.extract_breaklines()
        self.ws = self.extract_websites()

    def getVals(self):
        return (self.bl, self.wg1, self.wg2, self.ws, self.nms,
                self.pos, self.cg1, self.cg2, self.cg3)

    def extract_POS(self):
        return self.english_postagger.tag(
            self.english_tokenizer.tokenize(self.data))

    def extract_websites(self):
        websites = []
        result = re.findall('href=\"(.*?)\"', self.data)
        for r in result:
            if (r == 'mailto:') or (r == 'http:///'):
                continue
            else:
                websites.append(r)
        return websites

    def extract_breaklines(self):
        breaklines = []
        idx_old = 0
        idx_new = self.data.find('<br>')
        breaklines.append(idx_new - idx_old)
        idx_old = idx_new
        while idx_old < len(self.data):
            idx_new = self.data.find('<br>', idx_old + 4)
            if idx_new == -1:
                break
            breaklines.append(idx_new - idx_old)
            idx_old = idx_new
        return breaklines

    def extract_chargrams(self, gram_size):
        return [''.join(self.data[i:i + gram_size])
                for i in range(len(self.data) - gram_size + 1)]

    def extract_wordgrams(self, gram_size):
        r = re.compile(r'[\s{}\t\n\r\+\>\<\=\¢\â\$]+'.format(re.escape(punctuation)))
        word_list = r.split(self.data)
        # word_list = re.split('\W+', self.data)
        # word_list = re.split(r'[\p{P} \\t\\n\\r\\+\\>\\<\\=\\¢\\â\\$]+', self.data)
        word_list = filter(None, word_list)
        return [''.join(word_list[i:i + gram_size])
                for i in range(len(word_list) - gram_size + 1)]

    def extract_names(self):
        r = re.compile(r'[\s{}\t\n\r\+\>\<\=\¢\â\$]+'.format(re.escape(punctuation)))
        word_list = r.split(self.data)
        # word_list = re.split('\W+', self.data)
        # word_list = re.split('[\p{P} \\t\\n\\r\\+\\>\\<\\=\\¢\\â\\$]+', self.data)
        word_list = filter(None, word_list)
        word_list = [x.lower() for x in word_list]
        return list(set(word_list) & set(self.all_names))
# sqlite3 connection
dbname = '/home/aahu/Dropbox/ryancompton.net/assets/praw_drugs/drugs.db'
conn = sqlalchemy.create_engine('sqlite+pysqlite:///' + dbname,
                                module=sqlite3.dbapi2)

def load_subreddit(tablename, conn):
    df = pd.read_sql(tablename, conn)
    return df

# <codecell>

from nltk.tokenize.stanford import StanfordTokenizer
stanfordTokenizer = StanfordTokenizer(
    path_to_jar='/home/aahu/Downloads/stanford-corenlp-full-2015-01-30/stanford-corenlp-3.5.1.jar')

def my_tokenize(text):
    return nltk.word_tokenize(text)
    # return nltk.wordpunct_tokenize(text)
    # return stanfordTokenizer.tokenize(text)
    # return nltk.tokenize.TreebankWordTokenizer().tokenize(text)

def build_tfidf_transformer(docs=[], tokenizer=my_tokenize,
                            max_doc_count=2000, vocab_limit=10000):
    """