def fetch_relevance(hit, path, q, kwic=True, samples=3):
    """Build one display string from a sample of a hit's match offsets.

    At most ``samples`` entries of ``hit.bytes`` are used: when there are at
    least ``samples`` of them they are drawn with ``sample`` (presumably
    ``random.sample`` -- confirm against the file's imports), otherwise all
    of them are used as-is.  For every chosen offset a window of 400 is
    requested through ``adjust_bytes`` and ``f.get_text``, the text is split
    into three parts by ``chunkifier``, the parts are cleaned, concatenated
    and decoded as UTF-8 with undecodable bytes ignored.

    Parameters:
        hit: object exposing ``bytes``; it is also passed to ``f.get_text``.
        path: forwarded unchanged to ``f.get_text``.
        q: not used by this function.
        kwic: when true the middle part is cleaned as well (``notag=False``),
            each snippet goes through ``align_text(..., 1)`` and snippets are
            joined with ``<br>`` plus a newline; when false the middle part
            is left untouched and snippets are joined with ``'... '``.
        samples: maximum number of offsets to render.

    Returns:
        The joined snippets; an empty string when no offset was chosen.
    """
    length = 400
    offsets = sample(hit.bytes, samples) if len(hit.bytes) >= samples else hit.bytes

    snippets = []
    for offset in offsets:
        # adjust_bytes takes a list of offsets, hence the one-element list.
        hit_offsets, start = adjust_bytes([int(offset)], length)
        raw_text = f.get_text(hit, start, length, path)
        before, match, after = chunkifier(raw_text, hit_offsets, highlight=True, kwic=kwic)
        before = clean_text(before, kwic=kwic)
        after = clean_text(after, kwic=kwic)
        if kwic:
            match = clean_text(match, notag=False, kwic=kwic)
        # NOTE(review): .decode() on the concatenation implies byte strings
        # (Python 2 style) -- confirm before porting.
        snippet = (before + match + after).decode('utf-8', 'ignore')
        if kwic:
            snippet = align_text(snippet, 1)
        snippets.append(snippet)

    separator = '<br>\n' if kwic else '... '
    return separator.join(snippets)
    
     
def fetch_relevance(hit, path, q, kwic=True, samples=3):
    """Render up to ``samples`` match offsets of ``hit`` as one string.

    The offsets come from ``hit.bytes``: a ``sample`` of size ``samples``
    when enough are available (presumably ``random.sample`` -- confirm the
    import), otherwise every offset.  Each offset is turned into a snippet
    by fetching a window of 400 via ``adjust_bytes`` / ``f.get_text``,
    splitting it with ``chunkifier``, cleaning the pieces with
    ``clean_text`` and decoding the concatenation as UTF-8 (errors ignored).

    Parameters:
        hit: object exposing ``bytes``; also handed to ``f.get_text``.
        path: passed through to ``f.get_text``.
        q: accepted but never read here.
        kwic: selects the output form.  True: the middle piece is cleaned
            with ``notag=False``, the snippet is run through
            ``align_text(..., 1)`` and the separator is ``<br>`` followed by
            a newline.  False: the middle piece is used as returned by
            ``chunkifier`` and the separator is ``'... '``.
        samples: upper bound on the number of snippets.

    Returns:
        All snippets joined by the separator ('' when there are none).
    """
    window = 400
    if len(hit.bytes) >= samples:
        chosen = sample(hit.bytes, samples)
    else:
        chosen = hit.bytes

    pieces = []
    for raw_offset in chosen:
        # adjust_bytes is given a single-element list of integer offsets.
        adjusted, start = adjust_bytes([int(raw_offset)], window)
        context = f.get_text(hit, start, window, path)
        left, centre, right = chunkifier(context, adjusted, highlight=True, kwic=kwic)
        left = clean_text(left, kwic=kwic)
        right = clean_text(right, kwic=kwic)
        if not kwic:
            # Plain mode: the highlighted middle is kept exactly as chunked.
            pieces.append((left + centre + right).decode('utf-8', 'ignore'))
            continue
        centre = clean_text(centre, notag=False, kwic=kwic)
        joined = (left + centre + right).decode('utf-8', 'ignore')
        pieces.append(align_text(joined, 1))

    if kwic:
        return '<br>\n'.join(pieces)
    return '... '.join(pieces)
def fetch_relevance(hit, path, q, samples=10):
    """Join short snippets for the first ``samples`` offsets of a hit.

    Takes ``hit.bytes[:samples]`` and, for each offset, requests a window of
    75 through ``adjust_bytes`` and ``f.get_text``, splits the text with
    ``chunkifier(..., highlight=True)``, cleans the leading and trailing
    parts with ``clean_text`` (the middle part is left as returned) and
    decodes the concatenation as UTF-8, ignoring undecodable bytes.

    Parameters:
        hit: object exposing a sliceable ``bytes``; also passed to
            ``f.get_text``.
        path: forwarded to ``f.get_text``.
        q: not used by this function.
        samples: how many leading offsets to render.

    Returns:
        The snippets joined with ``' ... '``; '' when there are none.
    """
    span = 75

    def snippet_for(offset):
        # adjust_bytes expects a list, so wrap the single integer offset.
        adjusted, start = adjust_bytes([int(offset)], span)
        context = f.get_text(hit, start, span, path)
        before, match, after = chunkifier(context, adjusted, highlight=True)
        before = clean_text(before)
        after = clean_text(after)
        return (before + match + after).decode('utf-8', 'ignore')

    return ' ... '.join([snippet_for(offset) for offset in hit.bytes[:samples]])
def tokenize(text, filter_list, within_x_words, direction, highlighting=False):
    """Turn one side of a context window into a filtered word list.

    The text is cleaned with ``clean_text(..., collocation=True)`` and
    lower-cased.  The outermost word, which may have been cut in half by the
    window, is then stripped with the module-level ``left_truncate`` or
    ``right_truncate`` pattern before ``tokenize_text`` splits the rest.

    Parameters:
        text: raw text on one side of the match.
        filter_list: forwarded to ``filter``.
        within_x_words: forwarded to ``filter``.
        direction: ``'left'`` selects left-side handling (strip the left-most
            word, reverse the token order); any other value is treated as
            the right side.
        highlighting: accepted but not used.

    Returns:
        Whatever ``filter(word_list, filter_list, within_x_words)`` returns.
        NOTE(review): a three-argument call cannot be the builtin ``filter``,
        so this must be a helper defined elsewhere -- confirm.
    """
    normalized = clean_text(text, collocation=True).lower()

    if direction == 'left':
        # Drop the left-most (possibly truncated) word, then reverse the
        # tokens for the left-hand side.
        words = tokenize_text(left_truncate.sub("", normalized))
        words.reverse()
    else:
        # Drop the right-most (possibly truncated) word.
        words = tokenize_text(right_truncate.sub("", normalized))

    return filter(words, filter_list, within_x_words)
Beispiel #5
0
def tokenize(text, filter_list, within_x_words, direction, highlighting=False):
    """Turn one side of a context window into a filtered word list.

    The text is cleaned with ``clean_text(..., collocation=True)`` and
    lower-cased.  The outermost word, which may have been cut in half by the
    window, is removed with a regular expression and the remainder is split
    by ``tokenize_text``.

    Parameters:
        text: raw text on one side of the match.
        filter_list: forwarded to ``filter``.
        within_x_words: forwarded to ``filter``.
        direction: ``'left'`` selects left-side handling (strip the left-most
            word, reverse the token order); any other value is treated as
            the right side.
        highlighting: accepted but not used.

    Returns:
        Whatever ``filter(word_list, filter_list, within_x_words)`` returns.
        NOTE(review): a three-argument call cannot be the builtin ``filter``,
        so this must be a helper defined elsewhere -- confirm.
    """
    text = clean_text(text, collocation=True)
    text = text.lower()

    # The patterns are raw strings: "\s" inside a plain literal is an invalid
    # escape sequence, which newer Python versions warn about.  The pattern
    # text handed to ``re`` is unchanged.
    if direction == 'left':
        # Hack off the left-most word (potentially truncated).
        text = re.sub(r"^[^\s]* ", "", text)
        word_list = tokenize_text(text)
        word_list.reverse()  # left side needs to be reversed
    else:
        # Hack off the right-most word (potentially truncated).
        text = re.sub(r" [^\s]*$", "", text)
        word_list = tokenize_text(text)

    word_list = filter(word_list, filter_list, within_x_words)

    return word_list