def preprocess_text(list_text):
    '''
    Function to preprocess documents.
    Preprocessing includes removing symbols in replace_with_space,
    removing any punctuation and removing stopwords.
    Return a 2-d list of preprocessed text
    '''
    # variables used by the preprocessing function
    replace_with_space = re.compile('[/(){}\[\]\|@,;]')
    symbols_to_remove = re.compile("[^a-z _]+")
    stop_words = set(stopwords.words('english'))
    added_stopwords = ['one', 'says', 'like', 'said', 'say', 'would', 'go']
    stop_words = set(list(stop_words) + added_stopwords)

    # list where preprocessed text will be stored
    preprocessed_text = []
    tknzr = TreebankWordTokenizer()
    lmtzr = WordNetLemmatizer()
    # stemmer = PorterStemmer()
    for sentence in list_text:
        text = sentence.lower()
        text = re.sub(replace_with_space, " ", text)
        text_tokens = tknzr.tokenize(text)
        text_tokens = [token for token in text_tokens if token not in stop_words]
        text_tokens = [lmtzr.lemmatize(token) for token in text_tokens]
        text = nltk.tokenize.treebank.TreebankWordDetokenizer().detokenize(text_tokens)
        text = re.sub(symbols_to_remove, "", text)
        text_tokens = tknzr.tokenize(text)
        preprocessed_text.append(text_tokens)
    return preprocessed_text
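# Hedged usage sketch (added for illustration, not part of the original snippet):
# assumes `re`, `nltk`, `stopwords`, `TreebankWordTokenizer` and `WordNetLemmatizer`
# are already imported and that the NLTK 'stopwords' and 'wordnet' data have been
# downloaded (e.g. nltk.download('stopwords'); nltk.download('wordnet')).
docs = ["The cats are running, and one says hello!"]
print(preprocess_text(docs))  # roughly [['cat', 'running', 'hello']]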
def tokenize(line):
    global tokenizer
    if args.skip_tokenization:
        return line
    if args.ptb:
        if tokenizer is None:
            tokenizer = TreebankWordTokenizer()
        return tokenizer.tokenize(line, convert_parentheses=True)
    return word_tokenize(line, language=args.language)
def test_word_tokenize_quotes(self):
    text = '"сл"'
    tokenizer = TreebankWordTokenizer()
    # _spans = nltk.word_tokenize(text)
    _spans = tokenizer.tokenize(text)
    spans = [s for s in _spans]
    print("".join(spans))
    for c in spans:
        print(len(c))
    self.assertEqual(3, len(spans))
def get_tokenizer(params: dict):
    model = params.get('tokenizer', '').lower()
    if model == 'punkt':
        return WordPunctTokenizer()
    if model != '' and model != 'treebank':
        raise ModuleNotFoundError(f'No such tokenizer {model}!')
    return TreebankWordTokenizer()
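# Hedged usage sketch (added for illustration, not part of the original snippet);
# assumes the tokenizer classes come from nltk.tokenize.
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer

print(type(get_tokenizer({'tokenizer': 'punkt'})).__name__)  # WordPunctTokenizer
print(type(get_tokenizer({})).__name__)                      # TreebankWordTokenizer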
class Tokenizer:

    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

    def tokenize(self, sentence):
        tokens = self.tokenizer.tokenize(sentence)
        return tokens
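# Hedged usage sketch (added for illustration): the Treebank word tokenizer splits
# contractions and trailing sentence punctuation.
tok = Tokenizer()
print(tok.tokenize("Don't stop."))  # expected: ['Do', "n't", 'stop', '.']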
def normalize(text):
    text = text.decode('utf-8')
    # strip URLs (scheme://...)
    text = re.sub(r'[a-zA-Z]+://[^\s]*', '', text)
    # strip IPv4 addresses
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)
    text = strip_accents_ascii(text)
    text = text.encode('utf-8')
    text = ' '.join(map(lambda x: x.lower(), TreebankWordTokenizer().tokenize(text)))
    return text
def iconize_corpus(args):
    """
    This script retrieves the sentences that contain at least one icon term
    - fdata is the current corpus
    - fembed is the icon embedding file
    """
    # load embedding terms
    embdwrds = defaultdict(str)
    embdsyns = defaultdict(str)
    with open(args.fembd, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            terms = line.split()
            term = terms[0]
            code = terms[1]
            wtype = terms[2]
            if wtype == "main":
                embdwrds[term] = code
            else:
                embdsyns[term] = code

    # filter sentences that are oov w.r.t. the embedding
    tbt = TreebankWordTokenizer()
    plist = ["..", "...", "``", "''", "."]
    with open(args.fdata, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            sen = line.lower()
            sen = ''.join(i for i in sen if ord(i) < 123)
            sen = tbt.tokenize(sen)
            sen = [x for x in sen if x not in string.punctuation]
            sen = [x for x in sen if x not in plist]
            sentence = []
            for word in sen:
                code = embdwrds[word]
                if code != "":
                    sentence.append(code)
                elif embdsyns[word] != "":
                    code = embdsyns[word]
                    sentence.append(code)
                else:
                    # comment out for pure icon mode
                    sentence.append(word)
            # pure icon mode
            sentence = str.join(" ", sentence)
            print(sentence)
def __init__(self, start_token: str, end_token: str, unk_token: str,
             num_words: int = None, max_seq_len: int = 100):
    self.treebank_word_tokenizer = TreebankWordTokenizer()
    improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
    improved_close_quote_regex = re.compile(u'([»”’])', re.U)
    improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
    self.treebank_word_tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
    self.treebank_word_tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
    self.treebank_word_tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))
    self.word_counts = OrderedDict()
    self.word_docs = {}
    self.num_words = num_words
    self.document_count = 0
    self.START_TOKEN = start_token
    self.END_TOKEN = end_token
    self.UNK_TOKEN = unk_token
    self.MAX_SEQ_LEN = max_seq_len
def tree_bank_tokenizer():
    tokenizer = TreebankWordTokenizer()
    tokenizer.PUNCTUATION.append((re.compile(r'[/\-]'), r' \g<0> '))
    tokenizer.PUNCTUATION.append((re.compile(r'\.\.'), r' .. '))
    tokenizer.PUNCTUATION.append((re.compile(r'[\.,\+]'), r' \g<0> '))
    tokenizer.STARTING_QUOTES.append((re.compile(
        r"(')(?![sS]\s|[mM]\s|[dD]\s|ll\s|LL\s|re\s|RE\s|ve\s|VE\s|t\s|T\s|\s)"), r" \1 "))
    return tokenizer
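# Hedged usage sketch (added for illustration): the extra PUNCTUATION rules make
# the tokenizer also split on slashes, hyphens, dots and plus signs.
tok = tree_bank_tokenizer()
print(tok.tokenize("low-cost 3.5-inch drive"))
# roughly ['low', '-', 'cost', '3', '.', '5', '-', 'inch', 'drive'];
# exact output may vary with the NLTK version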
def to_lower(self, item):
    tokenizer = TreebankWordTokenizer()
    for field in self.class_properties:
        current_field_value = getattr(item, field)
        setattr(item, field, [
            w.lower()
            for w in self.tokenizer_text(current_field_value, tokenizer)
        ])
    return item
def tokenize(s: str) -> list:
    """
    Tokenize the given text using TreebankWordTokenizer delivered along with NLTK
    :param s: text
    :return: list of tokens
    """
    import unicodedata  # needed for the NFKD normalization below
    from nltk import TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(s)
    result = []
    for word in tokens:
        # the trailing "decode" call is needed on Python 3
        # http://stackoverflow.com/questions/2592764/what-does-a-b-prefix-before-a-python-string-mean
        w = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8').strip()
        # add only if not empty (some data contained empty tokens)
        if w:
            result.append(w)
    return result
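# Hedged usage sketch (added for illustration): the NFKD normalization plus the
# ascii encode/decode round-trip drops accents and other non-ASCII characters.
print(tokenize("café naïve"))  # expected: ['cafe', 'naive']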
class TokenizePreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, rules=True):
        self.punct = set(string.punctuation).difference(set('%='))
        self.rules = rules
        self.splitters = re.compile("[-/.,|<>]")
        self.tokenizer = TreebankWordTokenizer()

    def fit(self, X=None, y=None):
        return self

    @staticmethod
    def inverse_transform(X):
        return [", ".join(doc) for doc in X]

    def transform(self, X):
        return [self.token_representation(sentence) for sentence in X]

    def token_representation(self, sentence):
        return list(self.tokenize(sentence))

    def tokenize(self, sentence):
        """break sentence into pos-tagged tokens; normalize and split on hyphens"""
        # extremely short sentences shall be ignored by next steps
        if len(sentence) < MIN_LEN:
            yield "_empty_sentence_"
        else:
            for token in self.tokenizer.tokenize(sentence):
                # Apply preprocessing to the token
                token_nrm = self.normalize_token(token)
                subtokens = [self.normalize_token(t) for t in self.splitters.split(token_nrm)]
                for subtoken in subtokens:
                    # If punctuation, ignore token and continue
                    if all(char in self.punct for char in token):
                        continue
                    yield subtoken

    def normalize_token(self, token):
        # Apply preprocessing to the token
        token = token.lower().strip().strip('*').strip('.')
        if self.rules:
            token = map_regex_concepts(token)
        return token
def __init__(self):
    lexicon = Lexicon(TreebankWordTokenizer())
    word2vec_name = 'word2vec/amazon.bin'
    vocab_size = 100000
    word2vec = Word2VecManager(path.join(Constants.DATASETS, word2vec_name),
                               vocab_size=vocab_size)
    source = EmbeddingVecSource(lexicon, word2vec)
    self.loader = ImdbDataLoader(source, root=path.join(Constants.DATASETS, 'amazon'))
    self.train = 'out'
    self.test = 'test'
def __init__(self):
    word2vec_name = 'word2vec/Imdb_min2.bin'
    vocab_size = 100000
    lexicon = Lexicon(TreebankWordTokenizer())
    word2vec = Word2VecManager(path.join(Constants.DATASETS, word2vec_name),
                               vocab_size=vocab_size)
    source = EmbeddingVecSource(lexicon, word2vec)
    self.loader = ImdbDataLoader(source, root=path.join(Constants.DATASETS, 'aclImdb'))
    self.train = 'Mixed'
    self.test = 'All/Test'
def predict(self, text):
    spans = list(TreebankWordTokenizer().span_tokenize(text))
    list_of_tokens = [
        text[i:j] if text[i:j] in self.dataset.vec.word2idx else UNK
        for (i, j) in spans
    ]
    tokenized_spans = [None, *spans, None]
    list_of_tokens = [SOS, *list_of_tokens, EOS]
    sequences = list(map(lambda s: int(self.dataset.vec.word2idx[s]), list_of_tokens))
    predictions, attentions, conicity_values = self.model.evaluate([sequences])
    predictions = np.array(predictions)
    return predictions[0], attentions[0], tokenized_spans, list_of_tokens
class TreebankSpanTokenizer(TreebankWordTokenizer):

    def __init__(self):
        self._word_tokenizer = TreebankWordTokenizer()

    def span_tokenize(self, text):
        ix = 0
        for word_token in self.tokenize(text):
            ix = text.find(word_token, ix)
            end = ix + len(word_token)
            yield ix, end, word_token
            ix = end

    def tokenize(self, text):
        return self._word_tokenizer.tokenize(text)
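# Hedged usage sketch (added for illustration): unlike NLTK's own span_tokenize,
# this override yields (start, end, token) triples located via str.find.
span_tok = TreebankSpanTokenizer()
for start, end, token in span_tok.span_tokenize("Hello world."):
    print(start, end, token)
# roughly: 0 5 Hello / 6 11 world / 11 12 .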
def _tok_and_norm(corpus_path: str) -> List:
    """
    Tokenizes and normalizes each line in the corpus, rejoins and then appends
    those lines to a larger list, titled 'tok_sents'.
    """
    tok_sents = []
    with open(corpus_path, 'r') as source:
        for line in tqdm(source):
            # normalize quotation marks/apostrophes for tokenization
            norm_toks = (line.replace("”", '"').replace("“", '"')
                             .replace("’", "'").replace("‘", "'").replace("amp;", ""))
            # tokenize each sentence and append that tok_sent to the list tok_sents
            tok_sent = TreebankWordTokenizer().tokenize(norm_toks)
            tok_sents.append(tok_sent)
    return tok_sents
def preprocess(df):
    p_stemmer = PorterStemmer()
    tbt = TreebankWordTokenizer()
    custom_en_stop = ['want', 'go', 'hey', 'also', 'ok']
    df = df.apply(lambda row: row.lower())
    df = df.apply(lambda row: re.sub('{.+}', '', row))
    df = df.apply(lambda row: re.sub("[0-9]{1,2} ?(am|pm)", "timeofday", row))
    df = df.apply(lambda row: re.sub("[0-9]{1,2} ?(hours?|hrs?|mins?|minutes?)", "durationtext", row))
    df = df.apply(lambda row: re.sub("[0-9]{10}\D", "phoneorpnr", row))
    df = df.apply(lambda row: word_tokenize(row))
    df = df.apply(lambda row: [WordNetLemmatizer().lemmatize(i) for i in row])
    df = df.apply(lambda row: [i for i in row if i not in string.punctuation])
    df = df.apply(lambda row: [i for i in row if i not in custom_en_stop])
    df = df.apply(lambda x: ' '.join(x))
    return df
def get_word_ids(query):
    con = clickhouse_driver.connect("clickhouse://127.0.0.1")
    ids = []
    ps = PorterStemmer()
    for word_start, word_end in TreebankWordTokenizer().span_tokenize(query):
        word = query[word_start:word_end]
        stem = ps.stem(word)
        cur = con.cursor()
        cur.execute("SELECT id FROM words WHERE word = %(word)s", {"word": stem})
        row = cur.fetchone()
        if row is None:
            print(f"Warning: Word {word} in form of {stem} not found in a database, skipping")
        else:
            id = row[0]
            ids.append(id)
    return ids
class NormalizationTokenization:

    def __init__(self):
        self.letters_mappings = {
            u"á": "a",
            u"é": "e",
            u"í": "i",
            u"ó": "o",
            u"ú": "u",
            u"ñ": "n",
            u"ü": "u"
        }
        self.tokenizer = TreebankWordTokenizer()

    def letter_without_accent(self, letter):
        'This method returns the version of a letter without an accent'
        if letter in self.letters_mappings:
            return self.letters_mappings[letter]
        else:
            return letter

    def normalize(self, text):
        '''This method returns a normalized version of the text.
        It removes all disallowed characters and makes the text lower case'''
        text = text.lower()
        mapIterator = map(lambda letter: self.letter_without_accent(letter), text)
        text = "".join(mapIterator)
        regex = r'[^a-zA-Z0-9\s\_\-\n]'
        text = re.sub(regex, '', text)
        return text

    def tokenize(self, text):
        'This method returns the text divided into tokens'
        return self.tokenizer.tokenize(text)

    def process_text(self, text):
        '''This method is the main method of this class.
        It processes the text and returns the result'''
        normalized_text = self.normalize(text)
        token_list = self.tokenize(normalized_text)
        return token_list
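# Hedged usage sketch (added for illustration): accented Spanish characters are
# mapped to their plain ASCII counterparts before tokenization.
norm = NormalizationTokenization()
print(norm.process_text(u"¿Qué más?"))  # expected: ['que', 'mas']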
u"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" u"|" # host name u"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)" # domain name u"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*" # TLD identifier u"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" u")" # port number u"(?::\d{2,5})?" # resource path u"(?:/\S*)?", re.UNICODE) tokenizer = TreebankWordTokenizer() stopword_set = set(stopwords.words("english")) punctuation_set = set(string.punctuation) stemmer = EnglishStemmer() def process_txt(txt, stem=True): words = [] txt_stripped = url_regex.sub("", txt) try: for sentence in sent_tokenize(txt_stripped): for w in tokenizer.tokenize(sentence): w_lower = w.lower() if w_lower not in stopword_set and w_lower not in punctuation_set:
class HindiLanguage(StopWordsFromFileMixIn):
    """Hindi language support module."""

    __slots__ = [
        # Stop words map
        '__stop_words_map',

        # Hunspell instance
        '__hindi_hunspell',

        # Word tokenizer
        '__treebank_tokenizer',
    ]

    def __init__(self):
        """Constructor."""
        super().__init__()

        self.__treebank_tokenizer = TreebankWordTokenizer()

        hunspell_dict_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'hindi-hunspell',
            'dict-hi_IN',
        )
        if not os.path.isdir(hunspell_dict_dir):
            raise McLanguageException(
                "Hunspell dictionary directory does not exist at path: %s." % hunspell_dict_dir
            )

        if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.dic')):
            raise McLanguageException("Hunspell dictionary file does not exist at path: %s" % hunspell_dict_dir)
        if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.aff')):
            raise McLanguageException("Hunspell affix file does not exist at path: %s" % hunspell_dict_dir)

        try:
            self.__hindi_hunspell = Hunspell(lang='hi_IN', hunspell_data_dir=hunspell_dict_dir)
        except Exception as ex:
            raise McLanguageException(
                "Unable to initialize Hunspell with data directory '%s': %s" % (hunspell_dict_dir, str(ex),)
            )

        # Quick self-test to make sure that Hunspell is installed and dictionary is available
        hunspell_exc_message = """
            Hunspell self-test failed; make sure that Hunspell is installed and
            dictionaries are accessible, e.g. you might need to fetch Git submodules by running:

                git submodule update --init --recursive
        """
        try:
            test_stems = self.stem_words(['गुरुओं'])
        except Exception as _:
            raise McLanguageException(hunspell_exc_message)
        else:
            if len(test_stems) == 0 or test_stems[0] != 'गुरु':
                raise McLanguageException(hunspell_exc_message)

    @staticmethod
    def language_code() -> str:
        return "hi"

    @staticmethod
    def sample_sentence() -> str:
        return (
            "ऋषियों को सताने वाले दुष्ट राक्षसों के राजा रावण का सर्वनाश करने वाले "
            "विष्णुवतार भगवान श्रीराम, अयोध्या के महाराज दशरथ के बड़े सपुत्र थे।"
        )

    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)
        if words is None:
            raise McLanguageException("Words to stem is None.")

        stems = []

        for word in words:
            if word is None or len(word) == 0:
                log.debug("Word is empty or None.")
                stem = word
            else:
                term_stems = self.__hindi_hunspell.stem(word)
                if len(term_stems) > 0:
                    stem = term_stems[0]

                    if stem is None or len(stem) == 0:
                        log.debug("Stem for word '%s' is empty or None." % word)
                        stem = word
                else:
                    log.debug("Stem for word '%s' was not found." % word)
                    stem = word

            stems.append(stem)

        if len(words) != len(stems):
            log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

        return stems

    def split_text_to_sentences(self, text: str) -> List[str]:
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        # Replace Hindi's "।" with line break to make tokenizer split on both "।" and period
        text = text.replace("।", "।\n\n")

        # No Hindi-specific non-breaking prefixes, so using the English file
        en = EnglishLanguage()
        return en.split_text_to_sentences(text)

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence is None.")
            return []

        # Normalize apostrophe so that "it’s" and "it's" get treated identically
        sentence = sentence.replace("’", "'")

        # Replace Hindi's "।" with a period so the tokenizer splits on both
        sentence = sentence.replace("।", ".")

        # TweetTokenizer / sentence_splitter don't work with Hindi for whatever reason, and word_tokenize() would
        # require NLTK data to be installed which is time consuming on Travis
        tokens = self.__treebank_tokenizer.tokenize(sentence)

        def is_word(token_: str) -> bool:
            """Returns True if token looks like a word."""
            if re.match(pattern=r'\w', string=token_, flags=re.UNICODE):
                return True
            else:
                return False

        # TweetTokenizer leaves punctuation in-place
        tokens = [token for token in tokens if is_word(token)]

        return tokens
def create_data_list(self, filename_list):
    # return two lists; create id2word and id2ner mapping dicts
    data_list = []
    ner_list = []
    self.id2word = {}
    self.id2ner = {}
    ner_id = 1
    word_id = 1
    puncts = "()-,.?!:;*/--"
    for filename in filename_list:
        # split train and validation dataset
        if 'Test' in str(filename):
            split = 'test'
        else:
            # split the training data into train/val (80/20)
            split = random.choices(["train", "val"], weights=(80, 20), k=1)[0]

        # parse xml data
        tree = ET.parse(filename)
        root = tree.getroot()
        for elem in root:
            sent_id = elem.get("id")
            sentence = elem.get("text")
            text_tokens = TreebankWordTokenizer().tokenize(sentence)
            text_tokenized = [word.strip(puncts).lower() if word[-1] in puncts else word
                              for word in text_tokens]
            text_tokenized = list(filter(None, text_tokenized))
            span_text = list(TreebankWordTokenizer().span_tokenize(sentence))

            # create data list
            char_ids = []
            for st in span_text:
                char_ids.append((st[0], (st[1] - 1)))
            for i, token in enumerate(text_tokenized):
                if token.lower() not in self.id2word.values():
                    self.id2word[word_id] = token.lower()
                    word_id += 1
                for id, word in self.id2word.items():
                    if word == token.lower():
                        token_id = id
                word_info_list = (sent_id, token_id, int(char_ids[i][0]), int(char_ids[i][1]), split)
                data_list.append(word_info_list)

            # create NER data list
            for sub_elem in elem:
                if sub_elem.tag == "entity":
                    ner = sub_elem.get("type")
                    if ner not in self.id2ner.values():
                        self.id2ner[ner_id] = ner
                        ner_id += 1
                    for id, ner_tmp in self.id2ner.items():
                        if ner_tmp == ner:
                            label = id
                    # get char_start_id and char_end_id
                    if ";" not in sub_elem.get("charOffset"):
                        char_start, char_end = sub_elem.get("charOffset").split("-")
                        char_start, char_end = int(char_start), int(char_end)
                        ner_list.append([sent_id, label, char_start, char_end])
                    # if an entity is mentioned more than once, split into several lines
                    else:
                        occurences = sub_elem.get("charOffset").split(";")
                        for occurence in occurences:
                            char_start, char_end = occurence.split("-")
                            char_start, char_end = int(char_start), int(char_end)
                            ner_list.append([sent_id, label, char_start, char_end])
    self.vocab = list(self.id2word.values())
    return data_list, ner_list
import json
import pickle as pk
import numpy as np
import random
import os
from tqdm import tqdm
import re
from glob import glob
from nltk import TreebankWordTokenizer
from string import punctuation
from collections import Counter

_tokenrize = TreebankWordTokenizer().tokenize
_START_VOCAB = ["<unk>", "<pad>", "<stop>"]


def format_data(js):
    data_list = js['data']
    formated = []
    for article in data_list:
        for passage in article['paragraphs']:
            context = passage['context'].strip()  # unicode string
            for qa in passage['qas']:
                q = qa['question']  # unicode string
                # a = qa['answers']  # list of dicts
                answer = [(_['text'].strip(), int(_['answer_start'])) for _ in qa['answers']]
                answer = set(answer)
def recognize(self, text) -> Set[Annotation]:
    annotations = []
    # We normalize the text (remove all punctuation and replace it with whitespace)
    normalized_input_text = self.punctuation_remove.sub(" ", text).replace("-", " ").lower()
    # We split the text into token spans (begin and end position from the start of the text)
    spans = TreebankWordTokenizer().span_tokenize(normalized_input_text)
    token_spans = [i for i in spans]
    # We iterate over tokens one by one until we reach the end of the text
    current_token_span_index = 0
    while current_token_span_index < len(token_spans):
        # We get the current token span
        currentSpan = token_spans[current_token_span_index]
        # We extract the string of the token from the text
        token = normalized_input_text[currentSpan[0]:currentSpan[1]]
        # If the word is a stoplist term or a termination term we skip it
        if token not in self.stop_words and token not in self.termination_terms:
            # We get the concept ids matching the phone of the current token
            token_phone = doublemetaphone(token)[0]
            concepts = self.concepts_from_phone(token_phone)
            # This is the start position of the first token of a matching sequence
            concept_start = currentSpan[0]
            # For now we have matched a single term, so currently the end position will be that of the
            # current token
            concept_end = currentSpan[1]
            match_cursor = 1
            stop_count = 0
            while current_token_span_index + match_cursor < len(token_spans):
                # We get the next token and position span
                next_span = token_spans[current_token_span_index + match_cursor]
                next_token = normalized_input_text[next_span[0]:next_span[1]]
                # If the token is in the termination list the matching process ends here
                if next_token in self.termination_terms:
                    break
                # If the token is in the stop list we skip it and increment the count of the skipped words;
                # we will need to subtract this from the total number of tokens for the concept
                elif next_token in self.stop_words:
                    stop_count += 1
                # Otherwise we try to find a match for the token phone in the dictionary index
                else:
                    # We doublemetaphone the token's text
                    next_token_phone = doublemetaphone(next_token)[0]
                    # We try to find matching concepts and compute the intersection with previously
                    # identified concepts
                    next_concepts = self.concepts_from_phone(next_token_phone) & concepts
                    # If we find none we stop the matching here
                    if len(next_concepts) == 0:
                        break
                    else:
                        # If we find a match, then we update the current end position to that of the
                        # currently matching token and update the intersected matched concept buffer
                        concepts = next_concepts
                        concept_end = next_span[1]
                # If we arrive here the current token has matched; we keep count of the current match length
                match_cursor += 1

            # Once we get out of the loop we reconstruct the matches from the concepts remaining in the set
            # after successive intersections; if concepts is empty there was no match and so
            # Tokens.conceptsToAnnotationTokens will return an empty list, otherwise we get a list of
            # AnnotationToken object instances that we add to the list of identified concepts
            for concept in concepts:
                key_parts = concept.split(":::")
                concept_id = key_parts[0]
                annotation = Annotation(
                    concept_id,
                    concept_start,
                    concept_end,
                    text[concept_start:concept_end],
                    match_cursor - stop_count,
                    label_key=concept,
                    concept=self.concept_index[concept])
                annotations.append(annotation)
        current_token_span_index += 1
    # Here we filter the annotations to keep only those where the concept length matches the length of the
    # identified annotation
    return set([
        annotation for annotation in annotations
        if annotation.matched_length == self.concept_length_index[annotation.label_key]
    ])
def tokenize(self, text):
    return TreebankWordTokenizer().tokenize(text)
import nltk.data
from nltk import word_tokenize, TreebankWordTokenizer

# usage: app inputFile
# output goes to the same dir, with a name like inputFile + "_preprocessed"
# wordTokenizer = RegexpTokenizer("[\w']+")
finalOutputFile = open(sys.argv[1] + "_preprocessed_sentences_splitted", 'w')
reviewsJSONFile = open(sys.argv[1], "r")
linenumber = 0
word_tokenizer = TreebankWordTokenizer()
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
for line in reviewsJSONFile:
    if linenumber % 1000 == 0:
        print(linenumber)
    linenumber += 1
    objJSON = json.loads(line)
    # tokenize and clean the review text
    reviewSTR = objJSON['reviewText']
    excludeSet = string.punctuation + string.digits
    tokenList = []
    sentList = sent_detector.tokenize(reviewSTR.strip())
    for sent in sentList:
        # remove digits and punctuation and transform to lower case
        sent = ''.join(' ' if ch in set(excludeSet) else ch.lower() for ch in sent)
from nltk import word_tokenize, TreebankWordTokenizer

# usage: app inputFile category prefix
# output goes to the same dir, with a name like inputFile + "_preprocessed"
# wordTokenizer = RegexpTokenizer("[\w']+")
finalOutputFile = open(sys.argv[1] + "_preprocessed", 'w')
reviewsJSONFile = open(sys.argv[1], "r")
prefix = sys.argv[3]
linenumber = 0
dummy_name = 0
word_tokenizer = TreebankWordTokenizer()
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
for line in reviewsJSONFile:
    if linenumber % 1000 == 0:
        print(linenumber)
    linenumber += 1
    objJSON = json.loads(line)
    # tokenize and clean the review text
    reviewSTR = objJSON['reviewText']
    excludeSet = string.punctuation + string.digits
    tokenList = []
    sentList = sent_detector.tokenize(reviewSTR.strip())
    for sent in sentList:
        # remove digits and punctuation and transform to lower case
        sent = ''.join(' ' if ch in set(excludeSet) else ch.lower() for ch in sent)