from random import sample  # needed for the random hit sampling below


def fetch_relevance(hit, path, q, kwic=True, samples=3):
    length = 400
    text_snippet = []
    # Pick at most `samples` hit offsets, chosen at random when there are enough.
    if len(hit.bytes) >= samples:
        byte_sample = sample(hit.bytes, samples)
    else:
        byte_sample = hit.bytes
    for byte in byte_sample:
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_start, conc_middle, conc_end = chunkifier(conc_text, bytes, highlight=True, kwic=kwic)
        conc_start = clean_text(conc_start, kwic=kwic)
        conc_end = clean_text(conc_end, kwic=kwic)
        if kwic:
            conc_middle = clean_text(conc_middle, notag=False, kwic=kwic)
            conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
            conc_text = align_text(conc_text, 1)
        else:
            conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
        text_snippet.append(conc_text)
    if kwic:
        text = '<br>\n'.join(text_snippet)
    else:
        text = '... '.join(text_snippet)
    return text

def fetch_relevance(hit, path, q, samples=10):
    length = 75
    text_snippet = []
    byte_sample = hit.bytes[:samples]
    for byte in byte_sample:
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_start, conc_middle, conc_end = chunkifier(conc_text, bytes, highlight=True)
        conc_start = clean_text(conc_start)
        conc_end = clean_text(conc_end)
        conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
        text_snippet.append(conc_text)
    text = ' ... '.join(text_snippet)
    return text
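
# Both fetch_relevance variants lean on helpers defined elsewhere in the
# module (adjust_bytes, f.get_text, chunkifier, clean_text and, in the KWIC
# variant, align_text). As a point of reference only, here is a minimal
# sketch of what adjust_bytes is assumed to do: re-base absolute hit offsets
# onto a context window opened around the first hit, returning the relative
# offsets plus the window start that f.get_text consumes. The real
# implementation may differ; the name below is hypothetical.
def _adjust_bytes_sketch(hit_bytes, length):
    """Minimal sketch, not the actual implementation."""
    byte_start = max(hit_bytes[0] - length // 2, 0)  # clamp at file start
    adjusted = [b - byte_start for b in hit_bytes]   # offsets relative to window
    return adjusted, byte_start
# The centering on the first offset is a guess; the only contract the callers
# above rely on is that the returned offsets index into the length-byte
# window fetched starting at byte_start.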

import re

# Assumed module-level patterns; the inline variant below implies these definitions.
left_truncate = re.compile(r"^[^\s]* ")
right_truncate = re.compile(r" [^\s]*$")


def tokenize(text, filter_list, within_x_words, direction, highlighting=False):
    text = clean_text(text, collocation=True)
    text = text.lower()
    if direction == 'left':
        text = left_truncate.sub("", text)  ## hack off left-most word (potentially truncated)
        word_list = tokenize_text(text)
        word_list.reverse()  ## left side needs to be reversed
    else:
        text = right_truncate.sub("", text)  ## hack off right-most word (potentially truncated)
        word_list = tokenize_text(text)
    ## `filter` here is a module-local helper taking three arguments, not the builtin.
    word_list = filter(word_list, filter_list, within_x_words)
    return word_list

def tokenize(text, filter_list, within_x_words, direction, highlighting=False):
    text = clean_text(text, collocation=True)
    text = text.lower()
    if direction == 'left':
        text = re.sub(r"^[^\s]* ", "", text)  ## hack off left-most word (potentially truncated)
        word_list = tokenize_text(text)
        word_list.reverse()  ## left side needs to be reversed
    else:
        text = re.sub(r" [^\s]*$", "", text)  ## hack off right-most word (potentially truncated)
        word_list = tokenize_text(text)
    word_list = filter(word_list, filter_list, within_x_words)
    return word_list
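
# Both tokenize variants trim one edge token because a context window cut at
# an arbitrary byte offset may begin or end mid-word. A small self-contained
# check of the two patterns above, on a made-up snippet (expected output in
# the trailing comments); the function name is illustrative only.
def _truncation_demo():
    snippet = "ttered first word then clean words then a cut-off las"
    print(left_truncate.sub("", snippet))   # first word then clean words then a cut-off las
    print(right_truncate.sub("", snippet))  # ttered first word then clean words then a cut-off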