def extract_tokens_from_file(self, responses, input_filename, n, token_dict):
    # Assumes the enclosing module imports `re`, a file-handling helper as `fh`
    # (read_csv, get_basename), and a `tokenizer` module (split_sentences, make_ngrams).
    Y = fh.read_csv(input_filename)
    rids = Y.index
    dataset = fh.get_basename(input_filename)
    for rid in rids:
        if rid in responses:
            text = responses[rid].lower().strip()
            tokens = []
            sentences = tokenizer.split_sentences(text)
            for s in sentences:
                sent_tokens = tokenizer.make_ngrams(s, n)
                #sent_tokens = [t.rstrip('`"\'') if re.search('[a-z]', t) else t for t in sent_tokens]
                #sent_tokens = [t.lstrip('`"\'') if re.search('[a-z]', t) else t for t in sent_tokens]
                tokens.extend(sent_tokens)
            tokens = [self.get_prefix() + t for t in tokens]
            # tag each token with its source, unless the text is normalized
            if self.params['source'] != 'normalized':
                tokens = [t + '_<' + self.params['source'] + '>' for t in tokens]
            # optionally tag each token with the dataset it came from
            if self.params['append_dataset']:
                tokens = [t + '_' + dataset for t in tokens]
            token_dict[rid] = tokens
        else:
            token_dict[rid] = []
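# Hypothetical, standalone sketch of the token decoration applied above: a source
# tag is appended unless the source is 'normalized', and a dataset suffix can
# follow. The prefix, source, and dataset values here are made up for illustration.
def _suffix_tagging_example():
    tokens = ['_n1_good', '_n1_movie']
    source = 'spoken'
    dataset = 'dataset1'
    if source != 'normalized':
        tokens = [t + '_<' + source + '>' for t in tokens]
    tokens = [t + '_' + dataset for t in tokens]
    # tokens == ['_n1_good_<spoken>_dataset1', '_n1_movie_<spoken>_dataset1']
    return tokens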
def extract_tokens_from_text(self, data, items_to_load, doc_index=None):
    token_dict = {}
    for key in items_to_load:
        # if a document index is given, it maps each key to a filename
        if doc_index is not None:
            doc_key = doc_index[key]['filename']
        else:
            doc_key = key
        text = data[doc_key]
        if self.lower:
            text = text.lower()
        text = text.strip()
        if self.replace_num is not None:
            text = re.sub(r'\d', self.replace_num, text)
        tokens = []
        sentences = text.split('\n')
        if doc_index is None or 'sentences' not in doc_index[key]:
            # no sentence selection available: tokenize every line
            for s in sentences:
                tokens.extend(tokenizer.make_ngrams(s, self.n, replace_numbers=False))
        else:
            # tokenize only the sentences listed in the document index
            for i in doc_index[key]['sentences']:
                s = sentences[int(i)]
                tokens.extend(tokenizer.make_ngrams(s, self.n, replace_numbers=False))
        tokens = [self.get_prefix() + t for t in tokens]
        token_dict[key] = tokens
    return token_dict
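# Hypothetical, standalone sketch (not part of the class above) of the doc_index
# behaviour: when an entry lists 'sentences', only those line indices are kept
# for tokenization. The keys, filenames, and text below are made up.
def _doc_index_example():
    doc_index = {'doc1': {'filename': 'doc1.txt', 'sentences': [0, 2]}}
    data = {'doc1.txt': "first line\nsecond line\nthird line"}
    lines = data[doc_index['doc1']['filename']].split('\n')
    selected = [lines[int(i)] for i in doc_index['doc1']['sentences']]
    # selected == ['first line', 'third line']
    return selected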
def extract_tokens_from_text(self, data):
    token_dict = {}
    for key, text in data.items():
        text = text.lower().strip()
        tokens = []
        sentences = tokenizer.split_sentences(text)
        for s in sentences:
            tokens.extend(tokenizer.make_ngrams(s, self.n))
        tokens = [self.get_prefix() + t for t in tokens]
        token_dict[key] = tokens
    return token_dict
def extract_tokens_from_file(self, data, n):
    token_dict = {}
    for key, text in data.items():
        text = text.lower().strip()
        tokens = []
        sentences = tokenizer.split_sentences(text)
        for s in sentences:
            tokens.extend(tokenizer.make_ngrams(s, n))
        tokens = [self.get_prefix() + t for t in tokens]
        token_dict[key] = tokens
    return token_dict
def extract_tokens_from_file(self, responses, input_filename, n, cluster_dict, token_dict):
    Y = fh.read_csv(input_filename)
    rids = Y.index
    for rid in rids:
        text = responses[rid].lower().strip()
        tokens = []
        sentences = tokenizer.split_sentences(text)
        for s in sentences:
            sent_tokens = tokenizer.make_ngrams(s, n)
            # strip stray quote characters from word-like tokens only
            sent_tokens = [t.rstrip('`"\'') if re.search('[a-z]', t) else t for t in sent_tokens]
            sent_tokens = [t.lstrip('`"\'') if re.search('[a-z]', t) else t for t in sent_tokens]
            # mark the end of each sentence
            sent_tokens = sent_tokens + ['__ENDS__']
            tokens.extend(sent_tokens)
        # map tokens to cluster ids; tokens without a cluster entry are dropped
        tokens = [self.get_prefix() + cluster_dict[t] for t in tokens if t in cluster_dict]
        token_dict[rid] = tokens
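# Hypothetical, standalone sketch of the cluster-mapping step above: tokens are
# replaced by their cluster ids, and tokens absent from cluster_dict (here
# 'unseen') are silently dropped. All names and values are made up; the prefix
# stands in for whatever self.get_prefix() returns.
def _cluster_mapping_example():
    cluster_dict = {'good': 'c12', 'movie': 'c03', '__ENDS__': 'c00'}
    tokens = ['good', 'movie', 'unseen', '__ENDS__']
    prefix = '_n1_'
    mapped = [prefix + cluster_dict[t] for t in tokens if t in cluster_dict]
    # mapped == ['_n1_c12', '_n1_c03', '_n1_c00']
    return mapped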