# Sentinel so the NLTK stopword lookup runs lazily at call time instead of
# at import time (the old default also shared one mutable list across calls).
_PT_STOPWORDS_DEFAULT = object()


def tokenization(corpus, stop_words=_PT_STOPWORDS_DEFAULT):
    """Space-tokenize a corpus and optionally remove stopwords.

    Input: corpus is a Series of documents (sentences).
    Output: a list of lists of words.
    stop_words: list of words to remove; pass None to skip removal.
        Defaults to NLTK's Portuguese stopword list, loaded on first use.
    """
    if stop_words is _PT_STOPWORDS_DEFAULT:
        stop_words = nltk.corpus.stopwords.words('portuguese')
    # Tokenization
    spacetok = SpaceTokenizer()
    corpus = [spacetok.tokenize(phrase) for phrase in corpus]
    # Stopword removal
    if stop_words is not None:
        stop_set = set(stop_words)  # O(1) membership instead of list scans
        corpus = [[word for word in phrase if word not in stop_set]
                  for phrase in corpus]
    return corpus
def displayPageView(request):
    """Django view: truncate logs_c, re-ingest the log file named in the
    'input-file' GET parameter, run the summary queries, and render
    display.htm with their results."""
    mycursor.execute('TRUNCATE table logs_c')
    filePath = request.GET['input-file']
    # SECURITY: user-supplied filename concatenated onto a fixed directory
    # allows path traversal via '../' -- sanitize (e.g. os.path.basename)
    # before trusting it.
    filePath = "C:/Users/Rhishabh/Documents/mithi hackathon/" + filePath
    log = readfile(filePath)
    tk = SpaceTokenizer()
    # Ingest the log line by line (the old pre-loop tokenize call was dead
    # work: its result was recomputed on the first loop iteration).
    line = log.readline()
    while line:
        process(tk.tokenize(line))
        line = log.readline()
    log.close()  # release the handle once ingestion is done
    mydb.commit()
    result1 = query_1()
    result2 = query2()
    result3 = query3()
    result4 = query4()
    result5 = query5()
    result7 = query7()
    # mydb.close()
    return render(
        request, 'display.htm', {
            'ipfile': filePath,
            'result1': result1,
            'result2': result2,
            'result3': result3,
            'result4': result4,
            'result5': result5,
            'result7': result7
        })
def tokenize(s):
    """Space-tokenize *s*; if a token starts with a newline, emit the
    newline as its own "\n" token followed by the remainder."""
    result = []
    for token in SpaceTokenizer().tokenize(s):
        if token.startswith("\n"):
            result.extend(["\n", token[1:]])
        else:
            result.append(token)
    return result
def extract_name(tweet):
    """Return the named-entity strings NLTK's ne_chunk finds in *tweet*."""
    tagged = pos_tag(SpaceTokenizer().tokenize(tweet))
    entity_tree = ne_chunk(tagged)
    names = []
    # Named entities appear as Tree subtrees; plain tokens are tuples.
    for node in entity_tree:
        if isinstance(node, nltk.tree.Tree):
            names.append(' '.join(leaf[0] for leaf in node.leaves()))
    return names
def read_sent(sent):
    """Parse "<start> <end> <dataset_id> <skip> tok1 tok2 ..." into
    (token, label) pairs.

    Tokens in positions [start, end] are labeled "1" when dataset_id is
    non-zero; everything else is labeled "0".
    """
    fields = SpaceTokenizer().tokenize(sent)
    start, end = int(fields[0]), int(fields[1])
    dataset = int(fields[2])
    tokens = fields[4:]
    labels = [0] * len(tokens)
    if dataset != 0:
        labels[start:end + 1] = [1] * (end + 1 - start)
    return [(tok, str(lab)) for tok, lab in zip(tokens, labels)]
def analyze_line(line):
    """Extract proper-noun names and a VADER sentiment score from *line*.

    Returns a dict with:
        names: NNP-tagged tokens with ASCII punctuation characters removed
        sentiment: polarity_scores() dict for the whole line
    """
    # Build the punctuation-stripping pattern once, outside the token loop
    # (the old code rebuilt it per NNP token). re.escape makes the character
    # class explicit; the matched character set is unchanged.
    punct = re.compile('[' + re.escape(string.punctuation) + ']')
    tagged = pos_tag(SpaceTokenizer().tokenize(line))
    names = [punct.sub('', word) for word, tag in tagged if tag == 'NNP']
    return {
        "names": names,
        "sentiment": SentimentIntensityAnalyzer().polarity_scores(line)
    }
def fun_1_1_5():
    """Tour of NLTK tokenizers (Python 2 syntax): RegexpTokenizer variants,
    predefined regexp tokenizers, str.split, line/space tokenizers, and the
    span utilities in nltk.tokenize.util."""
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.tokenize import regexp_tokenize
    tokenizer = RegexpTokenizer("[\w]+")
    print "RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions")
    print "regexp_tokenizer:", regexp_tokenize(
        "Don't hesitate to ask questions", pattern="\w+|\$[\d\.]+|\S+")
    # Split on whitespace gaps instead of matching token patterns
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    print "RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions")
    # Select only words that start with a capital letter
    sent = " She secured 90.56 % in class X \n. She is a meritorious student"
    capt = RegexpTokenizer('[A-Z]\w+')
    print "RegexpTokenizer:", capt.tokenize(sent)
    # Subclasses of RegexpTokenizer use predefined regular expressions
    from nltk.tokenize import BlanklineTokenizer
    print "BlanklineTokenizer:", BlanklineTokenizer().tokenize(sent)
    # Strings can be split on spaces, gaps, newlines, etc.
    from nltk.tokenize import WhitespaceTokenizer
    print "WhitespaceTokenizer:", WhitespaceTokenizer().tokenize(sent)
    # WordPunctTokenizer splits text with \w+|[^\w\s]+ into alphabetic
    # and non-alphabetic characters
    from nltk.tokenize import WordPunctTokenizer
    print "WordPunctTokenizer:", WordPunctTokenizer().tokenize(sent)
    # Splitting with the plain split() method
    print "split():", sent.split()
    print "split(' '):", sent.split(' ')
    print "split('\n'):", sent.split('\n')
    # Like sent.split('\n'), LineTokenizer splits the text into lines
    from nltk.tokenize import LineTokenizer
    print "LineTokenizer:", LineTokenizer().tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='keep').tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='discard').tokenize(sent)
    # SpaceTokenizer works like sent.split(' ')
    from nltk.tokenize import SpaceTokenizer
    print "SpaceTokenizer:", SpaceTokenizer().tokenize(sent)
    # nltk.tokenize.util tokenizes by returning a sequence of tuples:
    # each token's position and offset within the sentence
    print "标识符序列:", list(WhitespaceTokenizer().span_tokenize(sent))
    # Given a sequence of token spans, the relative spans can be returned
    from nltk.tokenize.util import spans_to_relative
    print "位置和偏移:", list(
        spans_to_relative(WhitespaceTokenizer().span_tokenize(sent)))
    # By splitting at each occurrence of the separator,
    # nltk.tokenize.util.string_span_tokenize(sent, separator) returns the
    # offsets of the tokens in sent:
    from nltk.tokenize.util import string_span_tokenize
    print "标识符序列:", list(string_span_tokenize(sent, " "))
def get_vocab(self, start_index=2, min_count=10):
    """Build a token->index vocabulary from all publication full texts and
    write it to <outdir>/vocab.json.

    The corpus is the concatenation of the raw text and its lowercased
    copy, so both cased and lowercased variants of each token are counted.
    Tokens occurring more than *min_count* times get consecutive indices
    starting at *start_index* (0/1 presumably reserved for padding/unknown
    -- confirm against the model code); the rest are discarded as OOV.
    """
    text = ''.join(list(self.publications['full_text'].values))
    all_words = SpaceTokenizer().tokenize(text + text.lower())
    # most_common() with no argument returns ALL tokens sorted by count.
    vocab = Counter(all_words).most_common()
    vocab_out_json = {}
    for items in vocab:
        if items[1] > min_count:
            # NOTE(review): .decode on a token and codecs.open(..., 'wb')
            # with json.dump below are Python 2 idioms; both raise on
            # Python 3 (str has no .decode; json.dump writes str, not
            # bytes) -- confirm the intended interpreter.
            vocab_out_json[items[0].decode(
                'utf-8', 'replace')] = len(vocab_out_json) + start_index
    print(len(vocab) - len(vocab_out_json), ' words are discarded as OOV')
    print(len(vocab_out_json), ' words are in vocab')
    with codecs.open(self.outdir + 'vocab.json', 'wb') as vocabfile:
        json.dump(vocab_out_json, vocabfile)
def read_doc(doc, labels):
    """Turn a document string and a '|'-separated span spec into
    (token, label) pairs.

    Each spec part is "<start> <end> <dataset_id> ..."; tokens covered by a
    part whose dataset_id is non-zero get label "1", all others "0".
    """
    tokens = SpaceTokenizer().tokenize(doc.strip())
    spans = [[int(field) for field in part.split()]
             for part in labels.strip().split('|')]
    token_labels = [0] * len(tokens)
    for span in spans:
        if span[2] != 0:
            first, last = span[0], span[1]
            token_labels[first:last + 1] = [1] * (last + 1 - first)
    return [(tok, str(lab)) for tok, lab in zip(tokens, token_labels)]
def space_tokenizer(text, strip=None):
    """Yield the tokens of *text* split on single spaces (equivalent to
    text.split(' ')), skipping punctuation tokens and whitespace-only
    tokens; each surviving token is passed through str.strip(strip)
    (default: strip surrounding whitespace)."""
    for candidate in SpaceTokenizer().tokenize(text):
        is_noise = candidate in patterns.PUNCTUATION or candidate.isspace()
        if not is_noise:
            yield candidate.strip(strip)
def tokenizarPorTipo():
    """Prompt for a tokenizer choice (1-4) and print the tokens of a fixed
    sample sentence; returns silently on any other input."""
    sample = "Sorry, I can't go to the meeting.\n"
    print("TreebankWordTokenizer - 1")
    print("WhitespaceTokenizer - 2")
    print("SpaceTokenizer - 3")
    print("WordPunctTokenizer - 4")
    choice = input("Introduzca un tokenizer: ")
    # Dispatch table instead of an if/elif chain; only the chosen
    # tokenizer class is instantiated.
    factories = {
        "1": TreebankWordTokenizer,
        "2": WhitespaceTokenizer,
        "3": SpaceTokenizer,
        "4": WordPunctTokenizer,
    }
    factory = factories.get(choice)
    if factory is None:
        return
    print(factory().tokenize(sample))
# Demo script (Python 2): space-tokenize a sentence, POS-tag it, and print
# the named-entity chunks NLTK finds.
from nltk.tag import pos_tag
import nltk.tokenize
from nltk.corpus import cmudict
from wordgen import gen_word
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer

sentence = "who is Mahatma Gandhi visiting I'm HIS PRETTY GIRLFRIEND a Denny's McDonalds in broad daylight Shtruus"
tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(sentence)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos)
print chunked_nes
# Named entities appear as Tree subtrees; join each subtree's leaf words
# back into a single string.
nes = [
    ' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes
    if isinstance(ne, nltk.tree.Tree)
]
print nes
# The block below is commented out via an (unterminated here) triple-quoted
# string: an alternative PERSON-extraction approach kept for reference.
'''
qry = "who is Mahatma Gandhi"
tokens = nltk.tokenize.word_tokenize(qry)
pos = nltk.pos_tag(tokens)
sentt = nltk.ne_chunk(pos, binary = False)
print sentt
person = []
for subtree in sentt.subtrees(filter=lambda t: t.node == 'PERSON'):
    for leave in subtree.leaves():
        person.append(leave)
print "person=", person
# Demo of NLTK corpus readers: a plain word-list corpus, the names corpus,
# and a tagged corpus read with a custom space tokenizer.
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

# NOTE(review): hard-coded Windows paths -- these runs assume nltk_data
# lives at C:/nltk_data; confirm on the target machine.
wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())
print(names.fileids())
print(len(names.words('male.txt')))
# Tagged corpus: words are split on spaces; tags declared as Brown tagset.
reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())
print("\n")
# Map the declared tagset onto the universal tagset on the fly.
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))
print(treebank.tagged_words())
# Minimal chart-parsing demo: a toy CFG and one parse of "the man walks".
from nltk import CFG,ChartParser
from nltk.tokenize import SpaceTokenizer

grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N
VP -> IV
Det -> 'the'
N -> 'man'
IV -> 'walks'
""")
#>>> grammar
#<Grammar with 14 productions>
#>>> grammar.start()
#S
#>>> grammar.productions()
#[S -> NP VP, NP -> Det N, VP -> IV, Det -> 'the', N -> 'man', IV -> 'walks']
parser = ChartParser(grammar)
# Space-tokenize the sentence so the terminals line up with the grammar.
parses = parser.parse_all(SpaceTokenizer().tokenize("the man walks"))
#>>> parses
#[Tree('S', [Tree('NP', [Tree('Det', ['the']), Tree('N', ['man'])]), Tree('VP', [Tree('IV', ['walks'])])])]
of comments. Words are stemmed, then each occurence of a word is counted. Each word is only counted once per comment. Out of the 300 most frequent words in the toxic category and the 300 most frequent words in the clean category, the vocabulary is selected as the words in the toxic category but not in the clean category.

@author: Alex
"""
from nltk.tokenize import SpaceTokenizer
from sklearn.feature_extraction import text
import pandas as pd
from nltk.stem.porter import PorterStemmer

# Module-level helpers shared by the functions below.
s_tok = SpaceTokenizer()
porter = PorterStemmer()

# Wikipedia-specific boilerplate terms added to sklearn's English stop
# list; the combined list is stemmed so it matches stemmed tokens.
additional_stopwords = ['wiki', 'utc', 'wikipedia']
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)
stemmed_stop_words = set([porter.stem(w) for w in stop_words])


def makeFrequencyDict(comments):
    '''
    Make a dictionary of the frequency a word appears in the corpus.
    Frequencies are normalized by number of example comments.
    '''
    # NOTE(review): this function continues beyond the excerpted text.
    n_examples = len(comments)
    freqDict = {}
def __init__(self):
    """Create the space-based tokenizer this instance uses."""
    self.tokenizer = SpaceTokenizer()
    # NOTE(review): the lines below are the tail of process(tokens); the
    # function header is outside this excerpt.
    protocol = tokens[5][1:-1]  # drop first/last char (presumably quotes -- confirm)
    status = tokens[6]
    size = tokens[7][:-1]  # drop trailing character (presumably '\n' -- confirm)
    # print(ip + ' ' + date_time + ' ' + method + ' ' + url + ' ' + protocol + ' ' + status + ' ' + size)
    val = (ip, date_time, method, url, protocol, status, size)
    mycursor.execute(sql, val)


# Type 0 -> Tab Seperated (Server 1)
# Type 1 -> Space Seperated (Server 2)
# Ingest the whole access log line by line into the logs_c table.
log = readfile("access_log")
line = log.readline()
tk = SpaceTokenizer()
tokens = tk.tokenize(line)
while line:
    tokens = tk.tokenize(line)
    process(tokens)
    line = log.readline()
mydb.commit()
print("records inserted.")

# Top client ip addresses by number of requests
sql = "SELECT IP, count(*) FROM logs_c GROUP BY IP ORDER BY count(*) DESC LIMIT 5"
mycursor.execute(sql)
results = mycursor.fetchall()
def get_test_docs(self):
    """Sample ~10% of publications as a test set and write three files to
    self.outdir: test_doc_ids (one id per line), test_docs (full text per
    line), and test_doc_gold (' | '-separated "start end dataset_id pub_id"
    mention records per doc; "-1 -1 0 <pub_id>" when a doc has none).

    Mention offsets are token indices into the space-tokenized full text.
    Uses np.random, so results vary across runs unless the seed is fixed.
    """
    test_doc_ids = open(self.outdir + '/test_doc_ids', 'w+')
    test_docs = open(self.outdir + '/test_docs', 'w+')
    golden_data = open(self.outdir + '/test_doc_gold', 'w+')
    test_doc_list = []
    # ~10% sample, excluding docs reserved for the zero-shot split.
    for doc in set(self.data_set_citations['publication_id']):
        if np.random.randint(
                0, 100) < 10 and doc not in self.zero_shot_doc_ids:
            test_doc_list.append(doc)
            test_doc_ids.write(str(doc) + '\n')
    logger.info(str(len(test_doc_list)) + ' test docs selected')
    pub_ids = list(self.data_set_citations['publication_id'])
    pos_tokens = 0
    neg_tokens = 0
    #to locate lines with relevant pubs
    for pub_id in test_doc_list:
        pub_text = self.full_text[str(pub_id) + '.txt']
        test_docs.write(pub_text + '\n')
        # Parallel token and (start, end) character-span lists for the doc.
        pub_text_tokens = list(SpaceTokenizer().tokenize(pub_text))
        pub_text_spans = list(SpaceTokenizer().span_tokenize(pub_text))
        cur_pos_tokens = 0
        cur_neg_tokens = len(pub_text_tokens)
        res_line = []
        # All citation rows that reference this publication.
        rows = [pub_ids.index(i) for i in pub_ids if i == pub_id]
        for idx in rows:
            d_row = self.data_set_citations.loc[idx]
            for mention_text in d_row['mention_list']:
                # Digits are blanked so numbering differences don't block
                # the substring search below.
                mention_text = re.sub('\d', ' ', mention_text)
                # mention_text = re.sub('[^ ]- ', '', mention_text)
                mention_text_spans = list(
                    SpaceTokenizer().span_tokenize(mention_text))
                # Case-insensitive occurrences of the mention in the doc.
                index_finder_lower = findall_lower(mention_text, pub_text)
                found_indices = [idx for idx in index_finder_lower]
                for find_index in found_indices:
                    # NOTE(review): bare except silently drops mentions
                    # whose shifted spans don't align with doc token
                    # boundaries (pub_text_spans.index raises ValueError).
                    try:
                        if find_index != -1:
                            # Shift mention spans to absolute doc offsets.
                            new_mention_text_spans = [
                                (indices[0] + find_index,
                                 indices[1] + find_index)
                                for indices in mention_text_spans
                            ]
                            cur_pos_tokens += len(mention_text_spans)
                            res_line.append(
                                (pub_text_spans.index(
                                    new_mention_text_spans[0]),
                                 pub_text_spans.index(
                                     new_mention_text_spans[-1]),
                                 d_row['data_set_id'],
                                 d_row['publication_id']))
                    except:
                        pass
        # Deduplicate identical mention records before writing.
        res_line = list(set(res_line))
        if len(res_line) == 0:
            # no mentions at all
            res_line.append((-1, -1, 0, pub_id))
        i = 0
        for c in res_line:
            if i > 0:
                golden_data.write(' | ' + str(c[0]) + ' ' + str(c[1]) + ' ' +
                                  str(c[2]) + ' ' + str(c[3]))
            else:
                golden_data.write(
                    str(c[0]) + ' ' + str(c[1]) + ' ' + str(c[2]) + ' ' +
                    str(c[3]))
            i += 1
        golden_data.write('\n')
        pos_tokens += cur_pos_tokens
        neg_tokens += (cur_neg_tokens - cur_pos_tokens)
    test_doc_ids.close()
    test_docs.close()
    golden_data.close()
    logger.info(str(pos_tokens) + " pos tokens added.")
    logger.info(str(neg_tokens) + " neg tokens added.")
    logger.info("neg token percentage: {}".format(
        neg_tokens * 100 / (pos_tokens + neg_tokens)))
def text_pre_processing(text,
                        remove_number=True,
                        stop_word=True,
                        stop_word_language='english',
                        remove_punctuation=True):
    """Full text-cleaning pipeline: strip handles/URLs, expand negations,
    normalize, tokenize, lemmatize, stem, drop stop words, then chunk and
    NE-tag the result.

    Args:
        text: raw input (coerced via str()).
        remove_number: drop digit runs when True.
        stop_word: remove NLTK stop words when True.
        stop_word_language: language for the NLTK stop word list.
        remove_punctuation: strip punctuation via str.translate when True.

    Returns:
        (results, results_chunk, results_named_entitiy): the cleaned text,
        the NP-chunk parse (or '' if the text emptied out), and the
        NE-chunk tree (or '').
    """
    # ---------------------------------------------
    # Patterns
    results_chunk = ''
    results_named_entitiy = ''
    patterns1 = r'@[A-Za-z0-9_]+'  # twitter-style @mentions
    pattterns2 = r'https?://[^ ]+'  # http(s) URLs
    combined_patterns = r'|'.join((patterns1, pattterns2))
    www_patterns = r'www.[^ ]+'  # bare www URLs
    negations_dic = {
        "isn't": "is not",
        "aren't": "are not",
        "wasn't": "was not",
        "weren't": "were not",
        "haven't": "have not",
        "hasn't": "has not",
        "hadn't": "had not",
        "won't": "will not",
        "wouldn't": "would not",
        "don't": "do not",
        "doesn't": "does not",
        "didn't": "did not",
        "can't": "can not",
        "couldn't": "could not",
        "shouldn't": "should not",
        "mightn't": "might not",
        "mustn't": "must not"
    }
    negations_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) +
                                   r')\b')
    # ---------------------------------------------
    # convert to lower case
    results = str(text)
    # ---------------------------------------------
    # Text Cleaning
    results = re.sub(combined_patterns, '', results)
    results = re.sub(www_patterns, '', results)
    results = results.lower()
    results = negations_pattern.sub(lambda x: negations_dic[x.group()],
                                    results)
    results = re.sub("[^a-zA-Z]", " ", results)
    # NOTE(review): str.replace is literal, not regex -- these only remove
    # the exact parenthesized substrings like "(<br/>)"; confirm intent.
    results = results.replace("(<br/>)", "")
    results = results.replace('(<a).*(>).*(</a>)', '')
    results = results.replace('(&)', '')
    results = results.replace('(>)', '')
    results = results.replace('(<)', '')
    results = results.replace('(\xa0)', ' ')
    # ---------------------------------------------
    if (remove_number) & (results != ''):
        results = re.sub(r'\d+', '', results)
    # ---------------------------------------------
    if remove_punctuation & (results != ''):
        translator = str.maketrans('', '', string.punctuation)
        results = results.translate(translator)
    # ---------------------------------------------
    # Remove whitespaces
    results = results.strip()
    # ---------------------------------------------
    # Line Tokenize: keep only the first non-empty line.
    if results != '':
        line_tokenizer = LineTokenizer()
        results = line_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = results[0]
    # ---------------------------------------------
    # Tab Tokenize: keep only the first non-empty tab field.
    if results != '':
        tab_tokenizer = TabTokenizer()
        results = tab_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = results[0]
    # ---------------------------------------------
    # Space Tokenizer: collapse runs of spaces to single spaces.
    if results != '':
        space_toknizer = SpaceTokenizer()
        results = space_toknizer.tokenize(results)
        results = list(filter(None, results))
        results = ' '.join([w for w in results])
    # -----------------------------------------------
    # Lemmatization using NLTK (POS-aware via get_word_net_pos_tag)
    if results != '':
        lemmatizer_of_text = WordNetLemmatizer()
        word_list = word_tokenize(results)
        results = ' '.join([
            lemmatizer_of_text.lemmatize(w, get_word_net_pos_tag(w))
            for w in word_list
        ])
    # ---------------------------------------------
    # Stemming using NLTK
    if results != '':
        stemmer = PorterStemmer()
        if type(results) == list:
            results = ' '.join(str(w) for w in results)
        results = word_tokenize(str(results))
        results = [stemmer.stem(word) for word in results]
        results = ' '.join(str(w) for w in results)
    # ---------------------------------------------
    # Remove Stop Words
    if stop_word & (results != ''):
        # NOTE(review): downloads the stopword corpus on every call --
        # consider hoisting to module import time.
        nltk.download('stopwords')
        stop_words = set(stopwords.words(stop_word_language))
        word_tokens = word_tokenize(results)
        results = ' '.join(str(w) for w in word_tokens if not w in stop_words)
    # ---------------------------------------------
    # Chunking of the input, will be used for coloring of the text
    if results != '':
        result_str = TextBlob(results)
        reg_exp = 'NP: { < DT >? < JJ > * < NN >}'
        rp = nltk.RegexpParser(reg_exp)
        results_chunk = rp.parse(result_str.tags)
        # results_chunk.draw()
    # ---------------------------------------------
    # Named Entity Recognition
    if results != '':
        results_named_entitiy = ne_chunk(pos_tag(word_tokenize(results)))
    return results, results_chunk, results_named_entitiy
# NOTE(review): Python 2 demo fragment; `tokenizer`, `text`, and `text2`
# are defined earlier in the original script, outside this excerpt.
print tokenizer.tokenize(text)
print regexp_tokenize(text, pattern='\w+|\$[\d\.]+\S+')
# Tokenize whitespace
tokenizer = RegexpTokenizer('\s+', gaps=True)
print tokenizer.tokenize(text)
# Select only words starting with capital letters
capt = RegexpTokenizer('[A-Z]\w+')
print capt.tokenize(text2)
print BlanklineTokenizer().tokenize(text2)
print WhitespaceTokenizer().tokenize(text2)
print LineTokenizer(blanklines='keep').tokenize(text2)
print LineTokenizer(blanklines='discard').tokenize(text2)
# SpaceTokenizer works similar to .split('')
print SpaceTokenizer().tokenize(text2)
# Returns the sequence of tuples that are offsets of the tokens
# in a sentence:
print list(WhitespaceTokenizer().span_tokenize(text2))
# Returns the sequence of relative spans
print list(spans_to_relative(WhitespaceTokenizer().span_tokenize(text2)))
# Returns the offsets of tokens in text2 by splitting at each incidence of the separator:
print list(string_span_tokenize(text2, ""))
import nltk
from nltk.tokenize import SpaceTokenizer

# Sample with a leading blank, an inline newline, and a trailing newline --
# SpaceTokenizer splits on single spaces only, so newlines stay inside tokens.
sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
tokenizer = SpaceTokenizer()
print(tokenizer.tokenize(sent))
# Walkthrough of TaggedCorpusReader: default reader, a space-tokenizing
# reader, on-the-fly tagset mapping, and the built-in treebank corpus.
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d,
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
# NOTE(review): the next two calls still use `reader`, not `treebank` --
# possibly an oversight in the original demo; confirm intent.
print(reader.tagged_words())
print(reader.tagged_words(tagset='universal'))
def get_train_data(self, MAX_LENGTH=60, full_neg=True):
    """Write pos_data/neg_data training files of fixed-length token windows.

    Each document (excluding test and zero-shot docs) is split into windows
    of MAX_LENGTH space tokens. Windows fully containing a dataset mention
    are written to pos_data as "<start> <end> <data_set_id> <pub_id>
    <tokens...>" with window-relative token offsets; when *full_neg* is
    True, every mention-free window goes to neg_data as
    "-1 -1 0 <pub_id> <tokens...>".

    Args:
        MAX_LENGTH: window size in tokens.
        full_neg: extract all negative windows when True.
    """
    ## MAX_LENGTH: max length of segments to be split into
    ## neg ratio: how many neg data to use (out of 100), should be an integer
    ## full_neg: whether to extract all neg data
    max_length_token = MAX_LENGTH
    ## avoid taking docs from test set
    test_doc_ids = []
    zero_shot_doc_ids = []
    with open('../data/all_test_docs/test_doc_ids') as f:
        fl = f.readlines()
        test_doc_ids = [int(line.strip()) for line in fl]
    with open('../data/all_test_docs/zero_shot_doc_ids') as f:
        fl = f.readlines()
        zero_shot_doc_ids = [int(line.strip()) for line in fl]
    train_doc_len = len(
        set(self.data_set_citations['publication_id'].values)) - len(
            test_doc_ids) - len(zero_shot_doc_ids)
    logger.info('sample from ' + str(train_doc_len) + ' train docs')
    pos_count = 0
    neg_count = 0
    pos_tokens = 0
    neg_tokens = 0
    sampled = []  # publication ids already processed
    with codecs.open(self.outdir + 'pos_data',
                     'w') as pos_data, codecs.open(
                         self.outdir + 'neg_data', 'w') as neg_data:
        for index, row in self.data_set_citations.iterrows():
            pub_id = row['publication_id']
            if pub_id in zero_shot_doc_ids or pub_id in test_doc_ids:
                continue
            if pub_id in sampled:
                continue
            else:
                sampled.append(pub_id)
            # Gather every mention of this publication across all its
            # citation rows.
            pub_ids = list(self.data_set_citations['publication_id'])
            rows = [pub_ids.index(i) for i in pub_ids if i == pub_id]
            mention_list = []
            for r in rows:
                d_row = self.data_set_citations.loc[r]
                mention_list.extend(d_row['mention_list'])
            mention_list = set(mention_list)
            logger.info('pub id: {}, mentions: {}'.format(
                pub_id, len(mention_list)))
            sample_text = self.full_text[str(pub_id) + '.txt']
            # Parallel token and character-span views of the doc text.
            sample_text_tokens = list(
                SpaceTokenizer().tokenize(sample_text))
            sample_text_spans = list(
                SpaceTokenizer().span_tokenize(sample_text))
            pos_splits = []  # window indices that contain a mention
            for mention_text in mention_list:
                # Blank digits so numbering differences don't break the
                # substring search.
                mention_text = re.sub('\d', ' ', mention_text)
                # mention_text = re.sub('[^ ]- ', '', mention_text)
                mention_text_spans = list(
                    SpaceTokenizer().span_tokenize(mention_text))
                index_finder_lower = findall_lower(mention_text,
                                                   sample_text)
                all_found_indices = [idx for idx in index_finder_lower]
                for find_index in all_found_indices:
                    # NOTE(review): bare except silently skips occurrences
                    # whose shifted spans don't align with doc token
                    # boundaries (list.index raises ValueError).
                    try:
                        if find_index != -1:
                            # logger.info('Found: '+mention_text)
                            # Shift mention spans to absolute doc offsets.
                            new_mention_text_spans = [
                                (indices[0] + find_index,
                                 indices[1] + find_index)
                                for indices in mention_text_spans
                            ]
                            #write to training sample pointers here
                            for splits in range(
                                    len(sample_text_tokens) //
                                    max_length_token - 1):
                                if sample_text_spans.index(new_mention_text_spans[0]) > splits*(max_length_token) and \
                                   sample_text_spans.index(new_mention_text_spans[-1]) < (splits+1)*(max_length_token):
                                    pos_splits.append(splits)
                                    pos_count += 1
                                    pos_tokens += len(
                                        new_mention_text_spans)
                                    neg_tokens += (
                                        MAX_LENGTH -
                                        len(new_mention_text_spans))
                                    #TODO Wrapper over full data reader
                                    pos_data.write(
                                        str(
                                            sample_text_spans.index(
                                                new_mention_text_spans[0])
                                            - splits *
                                            (max_length_token)) + ' ' +
                                        str(
                                            sample_text_spans.index(
                                                new_mention_text_spans[-1])
                                            - splits *
                                            (max_length_token)) + ' ' +
                                        str(row['data_set_id']) + ' ' +
                                        str(row['publication_id']) + ' ' +
                                        ' '.join(sample_text_tokens[
                                            splits * (max_length_token):
                                            (splits + 1) *
                                            (max_length_token) + 1]) +
                                        '\n')
                                else:
                                    # print ('Annotation Error: Annotated gold standards not correct')
                                    pass
                    except:
                        # print ('Indexing Logic Error: Some corner index case missed while parsing')
                        pass
            if not full_neg:
                continue
            ## NOTE: index starts from 0
            ## -1 - 1 means no mention
            for splits in range(
                    len(sample_text_tokens) // (max_length_token) - 1):
                if splits not in pos_splits:
                    neg_data.write(
                        str(-1) + ' ' + str(-1) + ' ' + str(0) + ' ' +
                        str(row['publication_id']) + ' ' +
                        ' '.join(sample_text_tokens[splits *
                                                    (max_length_token):
                                                    (splits + 1) *
                                                    (max_length_token)]) +
                        '\n')
                    neg_count += 1
                    neg_tokens += MAX_LENGTH
    logger.info(str(pos_count) + " mentions added.")
    logger.info(str(neg_count) + " no mentions added.")
    logger.info(str(pos_tokens) + " pos tokens added.")
    logger.info(str(neg_tokens) + " neg tokens added.")
    logger.info("neg token percentage: {}".format(
        neg_tokens * 100 / (pos_tokens + neg_tokens)))
#print(question[1]) selected_k = [] for r in range(len(ranked_q)): pos = nltk.pos_tag(ranked_q[r]) selective_pos = ['NN', 'VB'] selective_pos_words = [] for word, tag in pos: if tag in selective_pos: selective_pos_words.append((word, tag)) selected_k.append(selective_pos_words) #print(selected_k[1][0]) # In[14]: from nltk.tokenize import SpaceTokenizer tm = SpaceTokenizer() to_rank = [] key_words = [] for i in range(len(ranked_q)): yn = 0 #ranked_q[i][yn] question[i] = untokenize(question[i]) yy = "_____" to_rank.append(tm.tokenize(ranked_q[i][0])) print("Q:", question[i].replace(to_rank[i][len(to_rank[i]) // 2], yy)) print('Ans - ', to_rank[i][len(to_rank[i]) // 2]) #quita = question[i].index(to_rank[i][len(to_rank[i])//2])
# -*- coding: utf-8 -*- from nltk.tokenize import SpaceTokenizer import sys s = sys.argv[1].decode('utf-8') dt = sys.argv[2] #print "dt = "+dt #python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \n ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ" ' ' tokens = [] if(dt == " "): tokens = SpaceTokenizer().tokenize(s) #python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \t ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ" '\t' from nltk.tokenize import TabTokenizer if(dt == '\\t'): print "dt = "+dt s = s.replace(u'\\t','\t') tokens = TabTokenizer().tokenize(s) #python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \n ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ" '\n' from nltk.tokenize import LineTokenizer if(dt == '\\n'): s = s.replace(u'\\n','\n')
    # NOTE(review): the two lines below are the tail of a function whose
    # header is outside this excerpt.
    X_out.append(getEmbeddingOfSequence(args, seq))
    return X_out


def getEmbeddingOfSequence(args, sequence):
    """Map a sequence of word indices to word2vec vectors, falling back to
    the '<UK>' (unknown-word) vector for out-of-vocabulary indices."""
    list = []  # NOTE(review): shadows the builtin `list`
    for i in sequence:
        if i in args.index2word:
            list.append(args.word2vec[args.index2word[i]])
        else:
            list.append(args.word2vec['<UK>'])
    #print list
    return list


# Module-level space tokenizer shared by the feature extractors below.
stringTokenizer = SpaceTokenizer()
'''Other features over relationphrase '''


def get_other_relationphrase_features(args, x_1, relationphrase_complete,
                                      relationphrase_middle, entity_1,
                                      entity_2):
    # Relation phrases shorter than 4 characters get no extra features.
    # NOTE(review): this function is cut off mid-statement in the excerpt.
    if len(relationphrase_complete) < 4:
        return x_1
    #nltk.word_tokenize(sentence)
    relationphrase_complete_tokens = stringTokenizer.tokenize(
        relationphrase_complete)
    relationphrase_complete_postags = nltk.pos_tag(