def pos_titles_from(input_path, output_path=None, options=None):
    finput, foutput = get_streams(input_path, output_path)
    skip, end = get_options(options)
    tokenizer = Tokenizer()
    tagger = PerceptronTagger()
    line_counter = 0
    skipped_lines = 0
    for line in finput:
        log_advance(1000000, line_counter)
        line_counter += 1
        if line_counter <= skip:
            continue
        if end and line_counter > end:
            break
        try:
            paper_id, title = get_fields(line)
            if is_english(title):
                print >> foutput, paper_id
                tokens = tokenizer.tokenize(title)
                for token in tagger.tag(tokens):
                    print >> foutput, token[0], token[1]
                print >> foutput
            else:
                skipped_lines += 1
        except:
            print >> sys.stderr, "Error:", line, sys.exc_info()
    log_nlines(line_counter, skipped_lines)
def find_ml(self, td):
    f_tokenizer = TreebankWordTokenizer()
    query_words = f_tokenizer.tokenize(td)
    genres = self.sentiment_analysis(query_words)
    weighted_genres = []
    genre_weights = {}
    for x in genres:
        if x[1] is not None:
            weighted_genres.append(x[0])
            genre_weights[x[0]] = x[1]
    d_score_updates = {}
    for movie in self.movies:
        g = self.genre_dict[movie][0]
        total_genre_score = 0
        if u'Comedy' in g and 'comedy' in weighted_genres:
            total_genre_score += genre_weights['comedy']
        if u'Action' in g and 'action' in weighted_genres:
            total_genre_score += genre_weights['action']
        if u'Crime' in g and 'crime' in weighted_genres:
            total_genre_score += genre_weights['crime']
        if u'Drama' in g and 'drama' in weighted_genres:
            total_genre_score += genre_weights['drama']
        d_score_updates[self.movies.index(movie)] = total_genre_score * .1
    return d_score_updates
def transformTweetData(tweet):
    content = unicode(tweet.sentence.lower(), errors='ignore')
    words = content.strip().split()
    tokenizer = TreebankWordTokenizer()
    extra_features = []
    content = " ".join(words + extra_features)
    tokens = tokenizer.tokenize(content)
    tokens = [t for t in tokens if t not in stopwords]
    return tokens
def tokenize_en(text):
    """
    Return a list of lists of the tokens in text, separated by sentences.
    """
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer = TreebankWordTokenizer()
    sentences = [tokenizer.tokenize(sentence)
                 for sentence in sent_tokenizer.tokenize(text)]
    return sentences
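A minimal usage sketch for tokenize_en (not part of the original snippet), assuming the NLTK punkt model has already been downloaded:

import nltk

nltk.download('punkt')  # one-time download of the sentence splitter model
print(tokenize_en("Good muffins cost $3.88 in New York. Please buy me two."))
# Roughly: [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
#           ['Please', 'buy', 'me', 'two', '.']]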
def pos_per_line(text_file):
    try:
        tokenizer = Tokenizer()  # pos
        tagger = PerceptronTagger()
        for s in text_file:
            tokens = tokenizer.tokenize(s)
            #print " ".join([" ".join(token) for token in tagger.tag(tokens)])
            print " ".join([token[1] for token in tagger.tag(tokens)])
    except:
        print >> sys.stderr, "Error pos_per_line(text_file): ", sys.exc_info()
def getNoun(self, parser, sentence):
    #mysent = sentence.encode('ascii','ignore')
    #sent = mysent.decode()
    penn = TreebankWordTokenizer()
    tags = parser.tag(penn.tokenize(sentence))
    the_tags = []
    nouns = []
    for t in tags:
        if t[1].startswith('NN'):
            nouns.append(t[0])
    return ' '.join(nouns)
def genLexicon(data):
    tok = TreebankWordTokenizer()
    texts = []
    for doc in data:
        for sent in doc:
            texts.append(tok.tokenize(sent[1].lower()))
    dictionary = corpora.Dictionary(texts)
    pickle.dump(dictionary, open("lex/toy.lex", "w"))
def crear_dicc_doc_term(path):
    result = []
    result_aux = []
    file = open(path)
    for f in file:
        result.append(f)
    tokenizer = TreebankWordTokenizer()
    for s in result:
        tokenizer = RegexpTokenizer("[\w']+")
        temp = tokenizer.tokenize(s)
        words = temp
        result_aux += eiminar_stopwords(words)
    return result_aux
def section_02_02( datDIR ):

    print("\n### ~~~~~ Section 02.02 ~~~~~~~~");

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    textfile = os.path.join( datDIR , "the-great-gatsby.txt" )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    with open(file = textfile, mode = 'r') as inF:
        sentences = []
        for i, tempLine in enumerate(inF):
            if i > 100:
                break
            tempLine = tempLine.strip()
            sentences.append(tempLine)
            print( "%5d: %s" % (i,tempLine) )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    mySentence = sentences[20] + " " + sentences[21]
    print("\nmySentence:")
    print( mySentence )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    #tokens = mySentence.split("([-\s.,;!?])+")
    tokens = re.split("([-\s.,;!?])+",mySentence)
    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None,tokens))
    print("\ntemp")
    print( temp )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myPattern = re.compile("([-\s.,;!?])+")
    tokens = myPattern.split(mySentence)
    print("\ntokens[-10:]")
    print( tokens[-10:] )

    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None,tokens))
    print("\ntemp")
    print( temp )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myRegexpTokenizer = RegexpTokenizer("\w+|$[0-9.]+|\S+")
    print("\nmyRegexpTokenizer.tokenize(mySentence):")
    print( myRegexpTokenizer.tokenize(mySentence) )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myTreebankWordTokenizer = TreebankWordTokenizer()
    print("\nmyTreebankWordTokenizer.tokenize(mySentence):")
    print( myTreebankWordTokenizer.tokenize(mySentence) )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
def word_tokenizePT(self, text, tokenizer):
    """ tokenize a portuguese sentence in words
        @input params: sentence - a sentence, a phrase (self)
                       tokenizer - "TB" for TreebankWordTokenizer
                                   "WP" for WordPunctTokenizer
        @returns word's list or error """
    if tokenizer == "TB":
        tokenizerTB = TreebankWordTokenizer()
        return tokenizerTB.tokenize(text)
    elif tokenizer == "WP":
        tokenizerWP = WordPunctTokenizer()
        return tokenizerWP.tokenize(text)
    else:
        return "tokenizer error: not found"
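For illustration only (not from the original source): the two tokenizer options differ mainly in how they split contractions and punctuation. A small standalone comparison, assuming NLTK is installed:

from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer

text = "Isn't this simple?"
print(TreebankWordTokenizer().tokenize(text))  # ['Is', "n't", 'this', 'simple', '?']
print(WordPunctTokenizer().tokenize(text))     # ['Isn', "'", 't', 'this', 'simple', '?']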
def __init__(self):
    self.tokenizer = TreebankWordTokenizer()
    self.word_pattern = re.compile(r"^([\w.]*)(\.)(\w*)$")
    self.proper_noun = re.compile(r"([A-Z]\.){2,}$")
    f = open(get_wpath("transition_words"), "r", encoding="utf8")
    transition_word = f.readline()
    self.words = r"([.,!?;:])\ *" + transition_word
    f.close()
    training_sents = nltk.corpus.treebank_raw.sents()
    tokens = []
    boundaries = set()
    offset = 0
    for sent in training_sents:
        tokens.extend(sent)
        offset += len(sent)
        boundaries.add(offset - 1)
    # Create training features
    featuresets = [(self.punct_features(tokens, i), (i in boundaries))
                   for i in range(1, len(tokens) - 1)
                   if tokens[i] in '.?!']
    train_set = featuresets
    self.classifier = nltk.NaiveBayesClassifier.train(train_set)
def get_data():
    glove = get_glove()
    tokenizer = TreebankWordTokenizer().tokenize
    text_field = Field(sequential=True, tokenize=tokenizer, include_lengths=True,
                       lower=True, use_vocab=True)
    label_field = Field(sequential=False, pad_token=None, unk_token=None,
                        is_target=True, use_vocab=True)
    with Timer('snli') as timer:
        print('snli{')
        splits = get_snli(text_field, label_field)
        print('}')
    text_field.build_vocab(*splits, vectors=glove)
    label_field.build_vocab(*splits)
    text_vocab = text_field.vocab
    label_vocab = label_field.vocab
    text_embeds = get_embeds(text_vocab.vectors)
    # snli = [pick_samples(ds, n=100) for ds in splits]  # TODO: comment
    snli = splits
    return (snli, text_field, label_vocab, text_embeds)
def predict_with_parser(cls, options):
    if options.input_format == "standard":
        data_test = cls.DataType.from_file(options.conll_test, False)
    elif options.input_format == "space":
        with smart_open(options.conll_test) as f:
            data_test = [cls.DataType.from_words_and_postags(
                             [(word, "X") for word in line.strip().split(" ")])
                         for line in f]
    elif options.input_format == "english":
        from nltk import download, sent_tokenize
        from nltk.tokenize import TreebankWordTokenizer
        download("punkt")
        with smart_open(options.conll_test) as f:
            raw_sents = sent_tokenize(f.read().strip())
            tokenized_sents = TreebankWordTokenizer().tokenize_sents(raw_sents)
            data_test = [cls.DataType.from_words_and_postags(
                             [(token, "X") for token in sent])
                         for sent in tokenized_sents]
    elif options.input_format == "tokenlist":
        with smart_open(options.conll_test) as f:
            items = eval(f.read())
        data_test = cls.DataType.from_words_and_postags(items)
    else:
        raise ValueError("invalid format option")

    logger.info('Initializing...')
    parser = cls.load(options.model, options)

    ts = time.time()
    cls.predict_and_output(parser, options, data_test, options.out_file)
    te = time.time()
    logger.info('Finished predicting and writing test. %.2f seconds.', te - ts)
class TreebankWordTokenizerWrapper:
    """ Seriously I don't know why we need this class - this makes no sense """

    PAT_NLTK_BUG = re.compile(r"^(?:(.+)(,|'s))$")

    def __init__(self):
        self.word_tokenizer = TreebankWordTokenizer()

    def tokenize(self, s):
        temp = self.word_tokenizer.tokenize(s)
        if temp:
            it = []
            for t0 in temp:
                t = [t0]
                while True:
                    m = self.PAT_NLTK_BUG.search(t[0])
                    if m:
                        t.insert(0, m.group(1))
                        t[1] = m.group(2)
                    else:
                        break
                it += t
                #sys.stderr.write('DEBUG: t=%s => %s\n' % (t0, t))
        else:
            it = temp
        return it
class CRCleaner(Cleaner):
    def __init__(self, input_dir, output_dir):
        super(CRCleaner, self).__init__(input_dir, output_dir, u"-\n'", punctuation + digits)
        self.t = TreebankWordTokenizer()

    def cleaned_text(self, text):
        if len(text) == 0:
            return u""
        sans_xml = self.xml_to_txt(text)
        arr = self.t.tokenize(sans_xml)
        return self.reconstruct_arr(arr)

    def xml_to_txt(self, xml):
        arr = []
        dom = parseString(xml)
        for node in (dom.firstChild.getElementsByTagName('speaking')
                     + dom.firstChild.getElementsByTagName('speaking-unknown-id')):
            paragraphs = node.getElementsByTagName('paragraph')
            if len(paragraphs) > 0:
                for node2 in paragraphs:
                    if node2.hasChildNodes():
                        child = node2.firstChild
                        if child.nodeType == child.TEXT_NODE:
                            arr += [child.data.replace(' ', ' ')]
        return ' '.join(arr)

    def new_filename(self, old_filename):
        return old_filename.replace('.xml', '.txt')
def tf_normalized(full_texts):
    tokenizer = Tokenizer()
    tf = {}
    max_value = 0
    for text in full_texts:
        text_tokens = tokenizer.tokenize(text)
        text_tokens = escape_not_abbreviations(text_tokens)
        for token in text_tokens:
            token = token.lower()
            tf.setdefault(token, 0.0)
            tf[token] += 1.0
            if tf[token] > max_value:
                max_value = tf[token]
    for t in tf:
        tf[t] = tf[t] / max_value
    return tf
def compute_similarity(j, query, tf, idf, doc_norm, review_idx_mapping, neighborhood):
    """Calculates similarity score between query and each review.

    Returns a list of review objects with similarity score attached."""
    if query == "":
        new_reviews = []
        for review in j["reviews"]:
            new_review = review
            new_review["sim_score"] = 1
            new_reviews.append(new_review)
        return new_reviews

    tokenizer = TreebankWordTokenizer()
    doc_scores = np.zeros(len(doc_norm))  # Initialize D
    query = query.lower()
    tokenized_query = tokenizer.tokenize(query)
    counter = Counter(tokenized_query)
    counter = {token: count for (token, count) in counter.items() if token in idf}
    query_token_to_idx = {token: idx for idx, (token, _) in enumerate(counter.items())}

    for token, count in counter.items():
        cur_token_idx = query_token_to_idx[token]
        q_tfidf = count * idf[token]  # Construct q
        for doc_id, freq in tf[token]:
            doc_scores[doc_id] += q_tfidf * freq * idf[token]  # Construct D

    for idx in range(len(doc_norm)):
        doc_scores[idx] = doc_scores[idx] / (doc_norm[idx] + 1)

    neighborhood = neighborhood.lower()
    output = [(review_idx_mapping[neighborhood][i], doc_scores[i])
              for i in range(len(doc_scores))]
    new_reviews = []
    for idx, score in output:
        new_review = j["reviews"][idx]
        new_review["sim_score"] = score
        new_reviews.append(new_review)
    return new_reviews
def filtrer1(ennonce):
    from nltk.tokenize import TreebankWordTokenizer
    from nltk.corpus import stopwords
    # Instantiate the tokenizer
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(ennonce)
    # Load the French stopwords
    french_stopwords = set(stopwords.words('french'))
    # A small filter
    tokens = [token for token in tokens if token.lower() not in french_stopwords]
    filtrat = []
    for element in tokens:
        filtrat.append(element)
    return (filtrat)
def q04_count_vectors(path, ranges=(1, 2), max_df=0.5, min_df=2):
    data, X_train, X_test, y_train, y_test = q01_load_data(path)
    tokenizer1 = TreebankWordTokenizer()
    tf = CountVectorizer(decode_error='ignore', tokenizer=tokenizer1.tokenize,
                         ngram_range=ranges, max_df=max_df, min_df=min_df,
                         stop_words='english')
    tf.fit(X_train)
    variable1 = tf.transform(X_train)
    variable2 = tf.transform(X_test)
    return variable1, variable2
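The same pattern in isolation, as a hedged sketch: scikit-learn's CountVectorizer accepts any callable as its tokenizer, so TreebankWordTokenizer().tokenize can be plugged in directly (the documents below are made up for the example):

from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import CountVectorizer

docs = ["Good muffins cost $3.88.", "Please buy me two muffins."]
vectorizer = CountVectorizer(tokenizer=TreebankWordTokenizer().tokenize, ngram_range=(1, 2))
X = vectorizer.fit_transform(docs)  # sparse document-term matrix of unigrams and bigrams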
def pennTreeBank(self, text):
    tokenizedText = []
    for s in text:
        s = s.lower()
        tokenizedText.append(TreebankWordTokenizer().tokenize(s))
    return tokenizedText
def __init__(self, filename):
    self.filename = filename
    self.tokenizer = TreebankWordTokenizer()
    self.sent_tokenizer = load('tokenizers/punkt/{0}.pickle'.format('english'))
    self.st = StanfordPOSTagger(
        '../stanfordPOStagger/english-bidirectional-distsim.tagger',
        '../stanfordPOStagger/stanford-postagger.jar',
        java_options='-mx2048m')
    #self.w2v_model = KeyedVectors.load_word2vec_format(
    #    "C:/Users/PC1/Desktop/python/деплом/deplom/constructions/GoogleNews-vectors-negative300.bin.gz",
    #    binary=True)
    self.w2v_model = None
    self.text = self.get_text()
    self.anns = []
    self.idx_list = IdxList()
    self.punct = punctuation + '‘’— \t\n'
def get_if(example):
    q_toks = example.q2_toks  # lower after tokenising as case info is useful
    for marker in CONDITIONAL_MARKERS:
        marker_toks = TreebankWordTokenizer().tokenize(marker)
        if find_sublist(marker_toks, q_toks) > 0:
            return True
    return False
def sep_cue(example):
    q_toks = example.q2_toks  # lower after tokenising as case info is useful
    for marker in SEPARABLE_MARKERS:
        marker_toks = TreebankWordTokenizer().tokenize(marker)
        if find_sublist(marker_toks, q_toks) > 0:
            return True
    return False
def __init__(
        self,
        word_tokenizer=TreebankWordTokenizer(),
        sent_tokenizer=LazyLoader('tokenizers/punkt/PY3/turkish.pickle'),
        **kwargs):
    self._seq = MongoDBLazySequence(**kwargs)
    self._word_tokenize = word_tokenizer.tokenize
    self._sent_tokenize = sent_tokenizer.tokenize
def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    expected = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass  # No matching token in the Google w2v vocab
        vectorized_data.append(sample_vecs)
    return vectorized_data
def _compute_unigram_frequency(self):
    wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
    tokenizer = TreebankWordTokenizer()
    total = len(wordlists.fileids())
    count = 0
    fdist = nltk.FreqDist()
    for fl in wordlists.fileids():
        count += 1
        fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
        with open(fl_abs_path, 'r') as f:
            words = tokenizer.tokenize(f.read())
            fdist.update(words)
        print 'freqdist: %s of %s' % (count, total)
    with open(os.path.join(self.corpus_root, 'unigram_frequency.txt'), 'w') as f:
        f.writelines(['%s %s\n' % (word, freq) for (word, freq) in fdist.items()])
    return None
def lemma_tokenizer(text):
    stop_words = stopwords.words("english")
    tokens = TreebankWordTokenizer().tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha()]
    filtered_words = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word) for word in filtered_words]
    return lemmas
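A possible call, assuming the NLTK stopwords and wordnet corpora are available; WordNetLemmatizer defaults to noun lemmas, so verbs like 'running' pass through unchanged:

print(lemma_tokenizer("The cats are running quickly"))
# Expected (approximately): ['cat', 'running', 'quickly']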
def tokeniseForDistance(sentence):
    """ Function to return tokens from a sentence """
    punc = list(string.punctuation)
    tokens = TreebankWordTokenizer().tokenize(sentence)
    #tokens = [token for token in tokens if token not in punc]
    return tokens
def tokenize(self, list_text, tokenizer=None):
    if not list_text:
        return None
    if not isinstance(list_text, list):
        raise ValueError("Please input a list of string for tokenization!")
    self.list_text = list_text
    if not tokenizer:
        self.raw_tokens = [text.split() for text in list_text]
    elif "treebank" in tokenizer.lower():
        t = TreebankWordTokenizer()
        self.raw_tokens = [t.tokenize(text) for text in list_text]
    elif "toktok" in tokenizer.lower():
        t = ToktokTokenizer()
        self.raw_tokens = [t.tokenize(text) for text in list_text]
    if not self.raw_tokens:
        return None
def polar_q1(example):
    q_toks = example.q1_toks  # lower after tokenising as case info is useful
    for marker in POLAR_MARKERS:
        marker_toks = TreebankWordTokenizer().tokenize(marker)
        if find_sublist(marker_toks, q_toks) > 0:
            return True
    return False
def ArTokenizer(text, token_min_len=2, token_max_len=15, lower=False):
    tokens = TreebankWordTokenizer().tokenize(
        accents.sub('', puncs.sub(' ', text)))
    # keep only Ar words between min/max len and remove other characters if any
    return [
        nonArabic.sub('', token) for token in tokens
        if arabic.findall(token) and token_min_len <= len(token) <= token_max_len
    ]
def vp_ell_q2(example):
    q_toks = example.q2_toks  # lower after tokenising as case info is useful
    for marker in VERB_ELLIPSIS_MARKERS:
        marker_toks = TreebankWordTokenizer().tokenize(marker)
        if find_sublist(marker_toks, q_toks) > 0:
            return True
    return False
def run(self):
    for i in range(int(self.lo), int(self.hi)):
        data = urlopen(str(url[i]))
        mybytes = data.read().decode('windows-1252').lower()
        tokenizer = TreebankWordTokenizer()
        line = re.sub(
            '[i?.,\',;:/\"<>\\%@#+-_&^$=()…—“”’*»’.``!¿\'`"’ï–]', '', mybytes)
        arrayWord = tokenizer.tokenize(line)
        for j in range(len(arrayWord)):
            self.binary.put(arrayWord[j], 1, i)
            w = self.hashTable.find(arrayWord[j])
            if (w != None):
                self.hashTable.insert(arrayWord[j], w + 1, i)
            else:
                self.hashTable.insert(arrayWord[j], 1, i)
            self.sequence.append(Data(i + 1, arrayWord[j], j))
def __init__(self):
    self.tokenizer = TreebankWordTokenizer()
    # remove % and @ from the 4th list as compared to original PUNCTUATION:
    self.tokenizer.PUNCTUATION = [
        (re.compile(r'([:,])([^\d])'), r' \1 \2'),
        # ABN: added to handle non-pronounceable dashes, like 'Súes-skurðinn'
        # keep dashes after digits and ordinals, and SNAV (directions). Add 'a-ö'?
        (re.compile(r'([^\.\d[A-ZÞÆÖÁÉÍÓÚÝÐ])([-])'), r'\1 '),
        (re.compile(r'([:,])$'), r' \1 '),
        (re.compile(r'\.\.\.'), r' ... '),
        (re.compile(r'[;#$&]'), r' \g<0> '),
        # Handles the final period.
        # ABN: changed this to deal with ordinals at the end of sentence: [^\.] -> [^\.\d],
        # don't detach '.' after a digit. (Might be too general)
        (re.compile(r'([^\.\d])(\.)([\]\)}>"\']*)\s*$'), r'\1 \2\3 '),
        (re.compile(r'[?!]'), r' \g<0> '),
        (re.compile(r"([^'])' "), r"\1 ' "),
    ]
def wh_q2(example):
    q_toks = example.q2_toks  # lower after tokenising as case info is useful
    for marker in WH_WORDS:
        marker_toks = TreebankWordTokenizer().tokenize(marker)
        if find_sublist(marker_toks, q_toks) > 0:
            return True
    return False
def filtrer_ennonce(ennonce):
    from nltk.tokenize import TreebankWordTokenizer
    from nltk.corpus import stopwords
    # Instantiate the tokenizer
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(ennonce)
    # Load the French stopwords
    french_stopwords = set(stopwords.words('french'))
    # A small filter
    tokens = [token for token in tokens if token.lower() not in french_stopwords]
    print(tokens)
def __prepare__(self):
    """ """
    conversations = open(path.join(self.BASE_PATH, self.CONVS_FILE), 'r').readlines()
    movie_lines = open(path.join(self.BASE_PATH, self.LINES_FILE), 'r').readlines()
    tbt = TreebankWordTokenizer().tokenize

    self.words_set = set()
    self.lines_dict = {}
    for i, line in enumerate(movie_lines):
        parts = map(lambda x: x.strip(), line.lower().split(self.FILE_SEP))
        tokens = tbt(parts[-1])
        self.lines_dict[parts[0]] = tokens
        self.words_set |= set(tokens)

    self.word2idx = {}
    self.word2idx[self.PAD_TOKEN] = 0
    self.word2idx[self.EOS_TOKEN] = 1
    self.word2idx[self.GO_TOKEN] = 2
    for i, word in enumerate(self.words_set):
        self.word2idx[word] = i + 3

    self.idx2word = [0] * len(self.word2idx)
    for w, i in self.word2idx.items():
        self.idx2word[i] = w

    # extract pairs of lines in a conversation (s0, s1, s2) -> {(s0, s1), (s1, s2)}
    utt_pairs = []
    for line in conversations:
        parts = map(
            lambda x: x[1:-1],
            map(lambda x: x.strip(), line.lower().split(self.FILE_SEP))[-1][1:-1].split(', '))
        utt_pairs += list(pairwise(parts))

    utt_pairs = np.random.permutation(utt_pairs)
    train_utt_pairs = utt_pairs[self.VAL_COUNT:]
    self.val_pairs = utt_pairs[:self.VAL_COUNT]

    def find_bucket(enc_size, dec_size, buckets):
        return next(
            dropwhile(lambda x: enc_size > x[0] or dec_size > x[1], buckets), None)

    for pair in train_utt_pairs:
        bckt = find_bucket(len(self.lines_dict[pair[0]]),
                           len(self.lines_dict[pair[1]]),
                           self.bucket_sizes)
        if bckt is None:
            self.bucket_pairs[(-1, -1)].append(pair)
        else:
            self.bucket_pairs[bckt].append(pair)

    self.bucket_ordering = []
    for bckt, _ in sorted(map(lambda x: (x[0], len(x[1])), self.bucket_pairs.items()),
                          key=lambda x: x[1], reverse=True):
        self.bucket_ordering.append(bckt)
def _compute_biagram_frequency(self):
    if not os.path.exists(self.bigram_frequency_dir):
        os.mkdir(self.bigram_frequency_dir)
    wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
    tokenizer = TreebankWordTokenizer()
    total = len(wordlists.fileids())
    count = 0
    for fl in wordlists.fileids():
        count += 1
        print 'freqdist: %s of %s' % (count, total)
        fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
        with open(fl_abs_path, 'r') as f:
            words = tokenizer.tokenize(f.read())
            bi_words = nltk.bigrams(words)
            fdist = nltk.FreqDist(bi_words)
        with open(os.path.join(self.bigram_frequency_dir, fl), 'w') as f:
            f.writelines(['%s %s %s\n' % (word[0], word[1], freq)
                          for (word, freq) in fdist.items()])
    return None
def testTreebankTokenizer(self):
    tokenizer = IndexedTokenizer(TreebankWordTokenizer())
    string = " Facing the long wall in front of you, your destination will be the first door to your left (36-880)."
    tokens = tokenizer.tokenize(string)
    self.assertEqual([t.text for t in tokens],
                     ['Facing', 'the', 'long', 'wall', 'in', 'front', 'of', 'you', ',',
                      'your', 'destination', 'will', 'be', 'the', 'first', 'door', 'to',
                      'your', 'left', '(', '36-880', ')', '.'])
    for i, token in enumerate(tokens):
        self.assertEqual(string[tokens[i].start:tokens[i].end], token.text)
def eiminar_stopwords(words):
    a = open('english.txt')
    result = []
    english_stops = []
    for f in a:
        result.append(f)
    tokenizer = TreebankWordTokenizer()
    for s in result:
        tokenizer = RegexpTokenizer("[\w']+")
        temp = tokenizer.tokenize(s)
        english_stops += temp
    resultado = []
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    for w in words:
        if not w in english_stops:
            resultado.append(stemmer.stem(w))
    return resultado
def read_data(filename):
    with open(filename, encoding='utf-8') as f:
        data = tf.compat.as_str(f.read())
        data = data.lower()
        data = text_parse(data)
        data = TreebankWordTokenizer().tokenize(data)  # The Penn Treebank
    return data
def stopWords(chaine):
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(chaine)
    # Load the French stopwords
    french_stopwords = set(stopwords.words('french'))
    # A small filter
    tokens = [token for token in tokens if token.lower() not in french_stopwords]
    counts = Counter(tokens)
    counts = counts.most_common(50)
    dico = {}
    tabDico = []
    for i in range(0, len(counts)):
        dico['text'] = counts[i][0]
        dico['size'] = counts[i][1]
        dico['href'] = "onclick/" + counts[i][0]
        tabDico.append(dico)
        dico = {}
    return tabDico
class TreeBankWordTokenizerWrapper(AbstractTokenizer):
    def __init__(self, do_lower_case: bool = False):
        self._tokenizer = TreebankWordTokenizer()
        self._do_lower_case = do_lower_case

    def tokenize_single(self, sentence: str):
        if self._do_lower_case:
            sentence = sentence.lower()
        return self._tokenizer.tokenize(sentence)
def sentiment_predict(new_sentence):
    new_sentence = TreebankWordTokenizer().tokenize(new_sentence)  # tokenize
    #new_sentence = [word for word in new_sentence if not word in stopwords]  # remove stopwords
    encoded = Tokenizer.texts_to_sequences([new_sentence])  # integer encoding
    print(encoded)
    pad_new = pad_sequences(encoded, maxlen=42)  # padding
    #print(pad_new)
    score = float(loaded_model.predict(pad_new))  # predict
    return score
def __init__(self, data_path):
    train_path = os.path.join(data_path, "train.txt")
    valid_path = os.path.join(data_path, "valid.txt")
    test_path = os.path.join(data_path, "test.txt")
    vocab_path = os.path.join(data_path, "vocab.pkl")

    self.tokenizer = TreebankWordTokenizer()

    if os.path.exists(vocab_path):
        self._load(vocab_path, train_path, valid_path, test_path)
    else:
        self._build_vocab(train_path, vocab_path)
        self.train_data = self._file_to_data(train_path)
        self.valid_data = self._file_to_data(valid_path)
        self.test_data = self._file_to_data(test_path)

    self.idx2word = {v: k for k, v in self.vocab.items()}
    self.vocab_size = len(self.vocab)
def text_fdist(text, min_occurence):
    from nltk.probability import FreqDist
    from nltk.tokenize import TreebankWordTokenizer

    tokenizer = TreebankWordTokenizer()
    # tokenise words:
    tokens = tokenizer.tokenize(text)
    # remove stopwords
    tokens = [token.lower() for token in tokens if token.lower() not in stopwords_en]
    fdist_in = FreqDist(tokens)
    # filter words with more than one occurence
    fdist = list(filter(lambda x: x[1] >= min_occurence, fdist_in.items()))
    return fdist
def tokenize_for_lda(article, tokenizer=TreebankWordTokenizer(),
                     stopwords=stopwords, regex_pattern=nonword):
    article_tokens = [
        tok for tok in tokenizer.tokenize(article)
        if (tok.lower() not in stopwords and not regex_pattern.search(tok))
    ]
    return article_tokens
class DssgUnigramExtractor(object):
    """
    An instance of this is used to obtain a list of unigrams, given a text.
    Usages:
        unigramExtractor = DssgUnigramExtractor()
        tokenList = unigramExtractor.extract("here is a text as a string")  # ['text', 'string']
    """

    _cache = {}

    def __init__(self):
        self._tokenizer = TreebankWordTokenizer()
        self._stopwordSet = set(stopwords.words("english"))
        self._stemmer = PorterStemmer()

    def __repr__(self):
        return self.__class__.__name__ + "()"

    def extract(self, text):
        """
        Given a text, return a list of unigram tokens.
        """
        if text not in DssgUnigramExtractor._cache:
            # unescape common HTML entities before stripping markup
            text = (
                text.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", '"')
                .replace("&amp;", "&")
                .replace("&nbsp;", " ")
            )
            text = nltk.clean_html(text)
            tokens = self._tokenizer.tokenize(text)
            newTokens = []
            for tok in tokens:
                # - lowercase, remove '
                tok = tok.lower().strip("`'.,-_*/:;\\!@#$%^&*()=\"")
                # - remove stopwords, one character words, and number-only tokens
                if tok in self._stopwordSet or len(tok) <= 1 or isAllNumbers(tok):
                    continue
                # - apply stemming
                # oldTok = copy.deepcopy(tok);  # for debug
                tok = self._stemmer.stem(tok)
                # sometimes a token is like 'theres' and becomes a stopword after stemming
                if tok in self._stopwordSet:
                    continue
                newTokens.append(tok)
            DssgUnigramExtractor._cache[text] = newTokens
        return DssgUnigramExtractor._cache[text]
def tokenize(text, stopword=False, punct=False, lower=False, stem=False,
             num=False, single=False, link=False):
    """
    num: True, exclude numbers
    single: True, exclude single char
    todo: deal with unicode mafuckers
    """
    token = []
    tokenizer = TreebankWordTokenizer()
    token_temp = tokenizer.tokenize(text)
    for elt in token_temp:
        #temp = i.decode('unicode-escape')
        #temp = re.sub(ur'[\xc2-\xf4][\x80-\xbf]+',
        #              lambda m: m.group(0).encode('latin1').decode('utf8'), temp)
        temp = unicode(elt)
        temp = unicodedata.normalize('NFKD', temp).encode('ascii', 'ignore')
        # get rid of empty strings
        #temp = i
        if temp:
            token.append(temp)
    token = [clean_front_end(word) for word in token if clean_front_end(word)]
    if lower:
        token = [word.lower() for word in token]
    if stem:
        token = [stemmer.stem(word) for word in token]
    if num:
        token = [word for word in token if not is_number(word)]
    if single:
        token = [word for word in token if len(word) > 1]
    if stopword:
        token = [word for word in token if word not in STOPWORD]
    if punct:
        token = [word for word in token if word not in PUNCT]
    if link:
        token = [word for word in token if not is_link(word)]
    # exclude empty strings
    token = [word for word in token if word]
    return token
def stopwords(filename):
    """A function that returns a dictionary with tokens as keys and counts of
    how many times each token appeared as values in the file with the given
    filename.

    Inputs:
      filename - the name of a plaintext file with a document on each line

    Outputs:
      A list of stopwords and a dictionary mapping tokens to counts.
    """
    # We now track the number of times a word shows up (term frequency) and
    # the number of documents with a given word in it (document frequency)
    # separately. We use a Counter, which is exactly like a dictionary except
    #  - the values can only be ints
    #  - any key it hasn't seen yet is assumed to already have a value of 0
    # This means we don't have to check whether we've used a key before when
    # we use the "+= 1" operation.
    term_frequency_dict = Counter()
    word_total = 0

    tokenizer = TreebankWordTokenizer()
    with open(filename, 'r') as f:
        for line in f:
            words = tokenizer.tokenize(line.lower())
            # For the programmer types: there are several more efficient
            # ways to write this section using dictionaries or sets. You're
            # welcome to rewrite this part to exercise that.
            for word in words:
                term_frequency_dict[word] += 1
                word_total += 1

    # A fun feature of Counters is that they have a built-in function that
    # gives you the n keys with the biggest values, or the "most common"
    # things being counted. We can use this to find the most common words.
    # This comes out as a list of pairs of key and value, like
    # [('foo', 10), ('bar', 7), ... , ('rare', 1)]
    stoplist_pairs = term_frequency_dict.most_common(100)
    stoplist = [word for (word, freq) in stoplist_pairs]
    return stoplist, term_frequency_dict, word_total
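A hedged usage sketch; 'docs.txt' is a hypothetical plaintext file with one document per line:

stoplist, tf_dict, total = stopwords('docs.txt')
print(stoplist[:10])   # ten of the 100 most frequent tokens, used here as the stoplist
print(total)           # total number of tokens seen across all documents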
def sentences(self, lowercase=False, strip_punct=[], num_placeholder=None):
    word_tokenizer = TreebankWordTokenizer()
    sent_tokenizer = nltk.data.LazyLoader('tokenizers/punkt/english.pickle')
    token_sents = [word_tokenizer.tokenize(sent)
                   for sent in sent_tokenizer.tokenize(self.response)]

    if lowercase:
        token_sents = [[token.lower() for token in sent] for sent in token_sents]

    if len(strip_punct) > 0:
        token_sents = [[token for token in sent if token not in strip_punct]
                       for sent in token_sents]

    if num_placeholder is not None:
        def replace_num(token, placeholder):
            try:
                float(token.replace(',', ''))
                return placeholder
            except ValueError:
                return token
        token_sents = [[replace_num(token, num_placeholder) for token in sent]
                       for sent in token_sents]

    return token_sents
def test_treebank_span_tokenizer(self):
    """
    Test TreebankWordTokenizer.span_tokenize function
    """
    tokenizer = TreebankWordTokenizer()

    # Test case in the docstring
    test1 = "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks)."
    expected = [
        (0, 4), (5, 12), (13, 17), (18, 19), (19, 23), (24, 26), (27, 30),
        (31, 32), (32, 36), (36, 37), (37, 38), (40, 46), (47, 48), (48, 51),
        (51, 52), (53, 55), (56, 59), (60, 62), (63, 68), (69, 70), (70, 76),
        (76, 77), (77, 78)
    ]
    result = tokenizer.span_tokenize(test1)
    self.assertEqual(result, expected)

    # Test case with double quotation
    test2 = "The DUP is similar to the \"religious right\" in the United States and takes a hardline stance on social issues"
    expected = [
        (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
        (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
        (65, 68), (69, 74), (75, 76), (77, 85), (86, 92), (93, 95), (96, 102),
        (103, 109)
    ]
    result = tokenizer.span_tokenize(test2)
    self.assertEqual(result, expected)

    # Test case with double quotation as well as converted quotations
    test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
    expected = [
        (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
        (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
        (65, 68), (69, 74), (75, 76), (77, 79), (79, 87), (87, 89), (90, 96),
        (97, 99), (100, 106), (107, 113)
    ]
    result = tokenizer.span_tokenize(test3)
    self.assertEqual(result, expected)
class MorphyStemmer:
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

    def __call__(self, doc):
        stemmed_doc = []
        for t in self.tokenizer.tokenize(doc):
            stem = wordnet.morphy(t)
            if stem:
                stemmed_doc.append(stem.lower())
            else:
                stemmed_doc.append(t.lower())
        return stemmed_doc
def make_word_set(context):
    """ Computes the set of all words used in a list of strings.

    Arguments
    =========
    context: a list of strings

    Returns
    =======
    word_set: set of distinct words
    """
    tokenizer = TreebankWordTokenizer()
    sw = stopwords.words('english')
    word_list = []
    for string in context:
        tkns = tokenizer.tokenize(string)
        for tk in tkns:
            if tk not in sw:
                word_list.append(tk)
    word_set = set(word_list)
    return word_set
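An illustrative call (assumes the NLTK stopwords corpus is installed); lowercase stopwords such as 'the' and 'a' are dropped before the set is built:

print(make_word_set(["the cat sat", "a cat ran"]))
# -> {'cat', 'sat', 'ran'}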
class Tokenizer(object):
    def __init__(self, language='english'):
        self.paragraph_tokenizer = nltk.data.load('tokenizers/punkt/%s.pickle' % language)
        self.sentence_tokenizer = TreebankWordTokenizer()
        self.english_stops = set(stopwords.words(language))

    def tokenize(self, text, remove_stopwords=False):
        sentences = self.paragraph_tokenizer.tokenize(text)
        token = []
        for sentence in sentences:
            words = self.sentence_tokenizer.tokenize(sentence)
            if remove_stopwords:
                token.append([word for word in words if word not in self.english_stops])
            else:
                token.append(words)
        return token
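Usage sketch for the Tokenizer class above, assuming the punkt and stopwords data have been downloaded; the exact tokens depend on the NLTK version:

tok = Tokenizer()
print(tok.tokenize("This is a test. It works well.", remove_stopwords=True))
# One token list per sentence, with lowercase stop words such as 'is' and 'a' removed.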
def __init__(self, images_path, annotations_path, buckets, bucket_minibatch_sizes,
             word2idx, mean_im, shuffle=True):
    self.buckets = buckets
    self.word2idx = word2idx
    self.bucket_minibatch_sizes = bucket_minibatch_sizes
    self.buffer_size = 16
    self.input_qsize = 64
    self.min_input_qsize = 16
    self.total_max = 0
    self.mean_im = mean_im
    self.tokenizer = TreebankWordTokenizer()
    self.annotations_path = annotations_path
    self.images_path = images_path
    self.shuffle = shuffle
    self._initialize()
    self.queue = Queue.Queue()
    self.out_queue = Queue.Queue(maxsize=self.buffer_size)
    self._init_queues()
class nlp:
    def __init__(self):
        self.tb = tb
        self.porter = nltk.PorterStemmer()
        self.tk = TreebankWordTokenizer()
        self.stopwords = set(stopwords.words())

    def tag(self, text):
        blob = self.tb(text)
        return blob.tags

    # 'clean' means stemming and stripping punctuation
    def noun(self, text, clean=True):
        text = text.replace('\\n', ' ')
        text = text.replace('\\t', ' ')
        blob = self.tb(text)
        tags = blob.tags
        result = []
        for (aword, atag) in tags:
            if atag == "NNP" or atag == "NNS" or atag == "NN":
                result.append(aword.lower())
        if clean == True:
            clean_result = []
            for word in result:
                nword = porter.stem(remove_non_chap(word))
                #nword = small_stem(remove_non_chap(word))
                if len(nword) > 2:
                    clean_result.append(nword)
            return clean_result
        return result

    # This one may not work well; don't use it for now
    def noun_p(self, text):
        blob = self.tb(text)
        return blob.noun_phrases

    def token(self, text):
        result, clean_result = self.tk.tokenize(text), []
        for word in result:
            nword = word.lower()
            nword = small_stem(nword)
            if len(nword) <= 30:
                clean_result.append(nword)
        return ' '.join(clean_result)
def __init__(self, mysql_con, redis_con, tokenizer=None, morph=None, classifier=None, points=[]):
    """
    Initialization.

    Args:
        mysql_con (PySQLPoolConnection): MySQL connection Object
        redis_con (StrictRedis): RedisDB connection Object
        tokenizer (NLTK.TreebankWordTokenizer): object to split tweets into words
        morph (pymorphy2.MorphAnalyzer): word analyzer - converts words tokens to
            normalized form. Requires a lot of memory, so it is not created for
            every event object.
        classifier (Object): scikit trained classifier to detect real and fake events
        points (list[dict]): raw messages from event detector
    """
    self.mysql = mysql_con
    self.redis = redis_con
    if morph:
        self.morph = morph
    else:
        self.morph = MorphAnalyzer()
    if tokenizer:
        self.tokenizer = tokenizer
    else:
        self.tokenizer = TreebankWordTokenizer()
    self.word = compile(r'^\w+$', flags=UNICODE | IGNORECASE)
    self.url_re = compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.validity = None
    self.verification = None
    self.cores = {}
    self.classifier = classifier
    if points:
        self.id = str(uuid4())
        self.created = datetime.now()
        self.updated = datetime.now()
        self.messages = {x['id']: x for x in points}
        self.get_messages_data()
        self.media = {}
        self.get_media_data()
        self.event_update()
def __init__(self):
    self.sentim_analyzer = SentimentAnalyzer()
    self.genre_dict = read_file("jsons/movie_genre_quote_dict_2.json")
    context_file = "jsons/final_context.json"
    movie_file = "jsons/final_movies.json"
    quote_file = "jsons/final_quotes.json"
    year_rating_file = "jsons/final_year_rating.json"
    self.context = read_file(context_file)
    self.movies = read_file(movie_file)
    self.quotes = read_file(quote_file)
    self.year_rating_dict = read_file(year_rating_file)

    # Reincode to unicode
    for i in range(len(self.context)):
        self.context[i] = self.context[i].encode("utf-8").decode("utf-8")
        self.movies[i] = self.movies[i].encode("utf-8").decode("utf-8")
        self.quotes[i] = self.quotes[i].encode("utf-8").decode("utf-8")

    self.context, self.quotes, self.movies = quote_pruner(self.context, self.quotes, self.movies)
    self.inverted_index = read_file("jsons/f_inverted_index.json")
    self.idf = read_file("jsons/f_idf.json")

    # Initialize query tokenizer
    self.tokenizer = TreebankWordTokenizer()

    # Compute document norms
    self.norms = compute_doc_norms(self.inverted_index, self.idf, len(self.context))

    word_co_filename = "jsons/word_co.json"
    word_count_filename = "jsons/word_count_dict.json"
    pmi_dict_filename = "jsons/pmi_dict.json"

    # Read files
    self.word_co = read_file(word_co_filename)
    self.word_count_dict = read_file(word_count_filename)
    self.pmi_dict = read_file(pmi_dict_filename)
from csv import writer
from datetime import datetime
from nltk.corpus import stopwords
from nltk.data import load
from nltk.corpus import sentiwordnet
from nltk.corpus import wordnet
from nltk.tag import pos_tag
from nltk.tokenize import TreebankWordTokenizer
import ujson

wordsTokenizer = TreebankWordTokenizer()
stopWords = set(stopwords.words('english'))
sentencesTokenizer = load('tokenizers/punkt/english.pickle')

arquivoClassificados = open('classificados.json')
classificados = ujson.load(arquivoClassificados)
arquivoClassificados.close()

acertos = 0
sentimentos = {}
comeco = datetime.now()

for resposta in classificados:
    texto = resposta['corpo']
    frases = sentencesTokenizer.tokenize(texto)
    palavras = []
    for frase in frases:
        palavrasTemp = wordsTokenizer.tokenize(frase)
        palavras.extend([palavra for palavra in palavrasTemp if palavra not in stopWords])
    posTags = pos_tag(palavras)
    positivo = 0
    negativo = 0
    for palavra, tag in posTags:
        synsets = None
        if tag.startswith('J'):