def get_text_data(text, expect_labels=True, tokenize=False):
    """
    Get text, returning an instance of the Doc class.

    doc.frag is the first Frag; each Frag points to the next via .next.

    text          -- raw input string, split into lines and whitespace words
    expect_labels -- if True, derive a 0/1 label from the '<S>' marker
    tokenize      -- if True, run word_tokenize.tokenize on each fragment
    """
    frag_list = None   # head of the linked list of Frags
    word_index = 0     # total number of words seen
    frag_index = 0     # total number of SBD hypotheses (Frags) created
    curr_words = []    # words accumulated since the last hypothesis
    prev = None        # most recently created Frag

    for line in text.splitlines():
        # A blank line with no pending words marks a segment boundary on
        # the most recent fragment.
        if (not line.strip()) and (not curr_words) and frag_list:
            frag.ends_seg = True
        for word in line.split():
            curr_words.append(word)
            if is_sbd_hyp(word):
                frag = Frag(' '.join(curr_words))
                if not frag_list:
                    frag_list = frag
                else:
                    prev.next = frag
                ## get label; tokenize
                if expect_labels:
                    frag.label = int('<S>' in word)
                if tokenize:
                    tokens = word_tokenize.tokenize(frag.orig)
                else:
                    tokens = frag.orig
                # Strip the annotation markers from the stored tokens.
                tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                frag.tokenized = tokens
                frag_index += 1
                prev = frag
                curr_words = []
            word_index += 1

    ## last frag
    frag = Frag(' '.join(curr_words))
    if not frag_list:
        frag_list = frag
    else:
        prev.next = frag
    if expect_labels:
        # BUG FIX: the original read the stale loop variable 'word', which
        # raised NameError on empty input and could mislabel an empty
        # trailing fragment; label from the fragment's own last word.
        frag.label = int('<S>' in curr_words[-1]) if curr_words else 0
    if tokenize:
        tokens = word_tokenize.tokenize(frag.orig)
    else:
        tokens = frag.orig
    tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
    frag.tokenized = tokens
    frag_index += 1

    sys.stderr.write(' words [%d] sbd hyps [%d]\n' % (word_index, frag_index))

    ## create a Doc object to hold all this information
    doc = Doc(frag_list)
    return doc
def get_text_data(text, expect_labels=True, tokenize=False):
    """
    Get text, returning an instance of the Doc class.

    doc.frag is the first Frag; each Frag points to the next via .next.

    NOTE(review): this is a byte-for-byte duplicate of an earlier
    definition of get_text_data in this file and silently shadows it —
    one of the two should probably be deleted.

    text          -- raw input string, split into lines and whitespace words
    expect_labels -- if True, derive a 0/1 label from the '<S>' marker
    tokenize      -- if True, run word_tokenize.tokenize on each fragment
    """
    frag_list = None   # head of the linked list of Frags
    word_index = 0     # total number of words seen
    frag_index = 0     # total number of SBD hypotheses (Frags) created
    curr_words = []    # words accumulated since the last hypothesis
    prev = None        # most recently created Frag

    for line in text.splitlines():
        # A blank line with no pending words marks a segment boundary on
        # the most recent fragment.
        if (not line.strip()) and (not curr_words) and frag_list:
            frag.ends_seg = True
        for word in line.split():
            curr_words.append(word)
            if is_sbd_hyp(word):
                frag = Frag(' '.join(curr_words))
                if not frag_list:
                    frag_list = frag
                else:
                    prev.next = frag
                ## get label; tokenize
                if expect_labels:
                    frag.label = int('<S>' in word)
                if tokenize:
                    tokens = word_tokenize.tokenize(frag.orig)
                else:
                    tokens = frag.orig
                # Strip the annotation markers from the stored tokens.
                tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                frag.tokenized = tokens
                frag_index += 1
                prev = frag
                curr_words = []
            word_index += 1

    ## last frag
    frag = Frag(' '.join(curr_words))
    if not frag_list:
        frag_list = frag
    else:
        prev.next = frag
    if expect_labels:
        # BUG FIX: the original read the stale loop variable 'word', which
        # raised NameError on empty input and could mislabel an empty
        # trailing fragment; label from the fragment's own last word.
        frag.label = int('<S>' in curr_words[-1]) if curr_words else 0
    if tokenize:
        tokens = word_tokenize.tokenize(frag.orig)
    else:
        tokens = frag.orig
    tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
    frag.tokenized = tokens
    frag_index += 1

    sys.stderr.write(' words [%d] sbd hyps [%d]\n' % (word_index, frag_index))

    ## create a Doc object to hold all this information
    doc = Doc(frag_list)
    return doc
def read_doc(path):
    """
    Read the file at *path* and return its cleaned alphabetic tokens.

    Returns clean_token(token) for every whitespace-separated token of
    tokenize(<file contents>) that is purely alphabetic.
    """
    # BUG FIX: the original open(path).read() never closed the handle;
    # a context manager guarantees it is closed even on error.
    with open(path) as fh:
        doc = fh.read()
    return [clean_token(token) for token in tokenize(doc).split() if token.isalpha()]
def get_data(files, expect_labels=True, tokenize=False, verbose=False):
    """
    Load text from files, returning an instance of the Doc class.

    doc.frag is the first Frag; each Frag points to the next via .next.

    files         -- a single path or a list of paths
    expect_labels -- if True, derive labels from '<S>' markers
    tokenize      -- if True, run word_tokenize.tokenize on each fragment
    verbose       -- if True, report word/hypothesis counts to stderr
    """
    if type(files) == type(''):
        files = [files]

    frag_list = None   # head of the linked list of Frags
    word_index = 0     # total number of words seen
    frag_index = 0     # total number of SBD hypotheses (Frags) created
    curr_words = []    # words accumulated since the last hypothesis
    prev = None        # most recently created Frag

    for file in files:
        sys.stderr.write('reading [%s]\n' % file)
        # BUG FIX: a context manager guarantees the handle is closed even
        # if an exception is raised mid-file (original used open/close).
        with open(file) as fh:
            for line in fh:
                ## deal with blank lines
                if (not line.strip()) and frag_list:
                    if not curr_words:
                        frag.ends_seg = True
                    else:
                        frag = Frag(' '.join(curr_words))
                        frag.ends_seg = True
                        if expect_labels:
                            frag.label = True
                        prev.next = frag
                        # BUG FIX: the original could store a stale or
                        # unset 'tokens' here when tokenize was False;
                        # made consistent with the hypothesis branch.
                        if tokenize:
                            tokens = word_tokenize.tokenize(frag.orig)
                        else:
                            tokens = frag.orig
                        tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                        frag.tokenized = tokens
                        frag_index += 1
                        prev = frag
                        curr_words = []

                for word in line.split():
                    curr_words.append(word)
                    if is_sbd_hyp(word):
                        frag = Frag(' '.join(curr_words))
                        if not frag_list:
                            frag_list = frag
                        else:
                            prev.next = frag
                        ## get label; tokenize
                        if expect_labels:
                            frag.label = int('<S>' in word)
                        if tokenize:
                            tokens = word_tokenize.tokenize(frag.orig)
                        else:
                            tokens = frag.orig
                        tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                        frag.tokenized = tokens
                        frag_index += 1
                        prev = frag
                        curr_words = []
                    word_index += 1

    ## last frag
    frag = Frag(' '.join(curr_words))
    if not frag_list:
        frag_list = frag
    else:
        prev.next = frag
    if expect_labels:
        # BUG FIX: the original read the stale loop variable 'word', which
        # raised NameError on empty input; label from the fragment's own
        # last word instead.
        frag.label = int('<S>' in curr_words[-1]) if curr_words else 0
    if tokenize:
        tokens = word_tokenize.tokenize(frag.orig)
    else:
        tokens = frag.orig
    tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
    frag.tokenized = tokens
    frag.ends_seg = True
    frag_index += 1

    if verbose:
        sys.stderr.write(' words [%d] sbd hyps [%d]\n' % (word_index, frag_index))

    ## create a Doc object to hold all this information
    doc = Doc(frag_list)
    return doc
def get_data(files, expect_labels=True, tokenize=False, verbose=False, files_already_opened=False):
    """
    Load text from files, returning an instance of the Doc class.

    doc.frag is the first Frag; each Frag points to the next via .next.

    files                -- a single path or a list of paths (or open
                            file-like objects when files_already_opened)
    expect_labels        -- if True, derive labels from '<S>' markers
    tokenize             -- if True, run word_tokenize.tokenize per fragment
    verbose              -- if True, report counts to stderr
    files_already_opened -- if True, items in files are open handles that
                            the caller owns; we do not close them
    """
    if type(files) == type(''):
        files = [files]

    frag_list = None   # head of the linked list of Frags
    word_index = 0     # total number of words seen
    frag_index = 0     # total number of SBD hypotheses (Frags) created
    curr_words = []    # words accumulated since the last hypothesis
    prev = None        # most recently created Frag

    for file in files:
        sys.stderr.write('reading [%s]\n' % file)
        if files_already_opened:
            fh = file
        else:
            fh = open(file)
        try:
            for line in fh:
                ## deal with blank lines
                if (not line.strip()) and frag_list:
                    if not curr_words:
                        frag.ends_seg = True
                    else:
                        frag = Frag(' '.join(curr_words))
                        frag.ends_seg = True
                        if expect_labels:
                            frag.label = True
                        prev.next = frag
                        # BUG FIX: the original could store a stale or
                        # unset 'tokens' here when tokenize was False;
                        # made consistent with the hypothesis branch.
                        if tokenize:
                            tokens = word_tokenize.tokenize(frag.orig)
                        else:
                            tokens = frag.orig
                        tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                        frag.tokenized = tokens
                        frag_index += 1
                        prev = frag
                        curr_words = []

                for word in line.split():
                    curr_words.append(word)
                    if is_sbd_hyp(word):
                        frag = Frag(' '.join(curr_words))
                        if not frag_list:
                            frag_list = frag
                        else:
                            prev.next = frag
                        ## get label; tokenize
                        if expect_labels:
                            frag.label = int('<S>' in word)
                        if tokenize:
                            tokens = word_tokenize.tokenize(frag.orig)
                        else:
                            tokens = frag.orig
                        tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                        frag.tokenized = tokens
                        frag_index += 1
                        prev = frag
                        curr_words = []
                    word_index += 1
        finally:
            # BUG FIX: close in a finally so a mid-file exception cannot
            # leak the handle; never close handles the caller owns.
            if not files_already_opened:
                fh.close()

    ## last frag
    frag = Frag(' '.join(curr_words))
    if not frag_list:
        frag_list = frag
    else:
        prev.next = frag
    if expect_labels:
        # BUG FIX: the original read the stale loop variable 'word', which
        # raised NameError on empty input; label from the fragment's own
        # last word instead.
        frag.label = int('<S>' in curr_words[-1]) if curr_words else 0
    if tokenize:
        tokens = word_tokenize.tokenize(frag.orig)
    else:
        tokens = frag.orig
    tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
    frag.tokenized = tokens
    frag.ends_seg = True
    frag_index += 1

    if verbose:
        sys.stderr.write(' words [%d] sbd hyps [%d]\n' % (word_index, frag_index))

    ## create a Doc object to hold all this information
    doc = Doc(frag_list)
    return doc
def get_text_data(text, expect_labels=True, tokenize=False, verbose=False):
    """
    Get text, returning an instance of the Doc class.

    doc.frag is the first Frag; each Frag points to the next via .next.

    text          -- raw input string, split into lines and whitespace words
    expect_labels -- if True, derive a 0/1 label from the '<S>' marker
    tokenize      -- if True, run word_tokenize.tokenize on each fragment
    verbose       -- if True, report word/hypothesis counts to stderr
    """
    frag_list = None   # head of the linked list of Frags
    word_index = 0     # total number of words seen
    frag_index = 0     # total number of SBD hypotheses (Frags) created
    curr_words = []    # words accumulated since the last hypothesis
    prev = None        # most recently created Frag

    for line in text.splitlines():
        ## deal with blank lines
        if (not line.strip()) and frag_list:
            if not curr_words:
                frag.ends_seg = True
            else:
                frag = Frag(' '.join(curr_words))
                frag.ends_seg = True
                if expect_labels:
                    frag.label = True
                prev.next = frag
                # BUG FIX: the original could store a stale or unset
                # 'tokens' here when tokenize was False; made consistent
                # with the hypothesis branch.
                if tokenize:
                    tokens = word_tokenize.tokenize(frag.orig)
                else:
                    tokens = frag.orig
                tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                frag.tokenized = tokens
                frag_index += 1
                prev = frag
                curr_words = []

        for word in line.split():
            curr_words.append(word)
            if is_sbd_hyp(word):
                frag = Frag(' '.join(curr_words))
                if not frag_list:
                    frag_list = frag
                else:
                    prev.next = frag
                ## get label; tokenize
                if expect_labels:
                    frag.label = int('<S>' in word)
                if tokenize:
                    tokens = word_tokenize.tokenize(frag.orig)
                    # BJD possible hack, but pretty sure this is needed:
                    # split trailing sentence punctuation off the last token.
                    tmp_tokens = tokens.split()
                    # BUG FIX: guard against an empty token list, which
                    # made tmp_tokens[-1] raise IndexError.
                    if tmp_tokens:
                        tokens = ' '.join(tmp_tokens[:-1] + re.split(r'([.?!]+["\')\]]*)$', tmp_tokens[-1]))
                else:
                    tokens = frag.orig
                tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                frag.tokenized = tokens
                frag_index += 1
                prev = frag
                curr_words = []
            word_index += 1

    ## last frag
    frag = Frag(' '.join(curr_words))
    if not frag_list:
        frag_list = frag
    else:
        prev.next = frag
    if expect_labels:
        # BUG FIX: the original read the stale loop variable 'word', which
        # raised NameError on empty input; label from the fragment's own
        # last word instead.
        frag.label = int('<S>' in curr_words[-1]) if curr_words else 0
    if tokenize:
        tokens = word_tokenize.tokenize(frag.orig)
    else:
        tokens = frag.orig
    tokens = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
    frag.tokenized = tokens
    frag.ends_seg = True
    frag_index += 1

    if verbose:
        sys.stderr.write(' words [%d] sbd hyps [%d]\n' % (word_index, frag_index))

    ## create a Doc object to hold all this information
    doc = Doc(frag_list)
    return doc