def fetch_concordance(hit, path, q):
    ## Determine length of text needed
    byte_distance = hit.bytes[-1] - hit.bytes[0]
    length = 1000 + byte_distance + 1000
    
    bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_text = format_strip(conc_text, bytes)
    conc_text = convert_entities(conc_text)
    start_highlight = conc_text.find('<span class="highlight"')
    m = re.search(r'<span class="highlight">[^<]*(</span>)',conc_text)
    if m:
        end_highlight = m.end(len(bytes) - 1)
        count = 0
        for char in reversed(conc_text[:start_highlight]):
            count += 1
            if count > 200 and char == ' ':
                break
        begin = start_highlight - count
        end = 0
        for char in conc_text[end_highlight:]:
            end += 1
            if end > 200 and char == ' ':
                break
        end += end_highlight
        first_span = '<span class="begin_concordance" style="display:none;">'
        second_span = '<span class="end_concordance" style="display:none;">'
        conc_text =  first_span + conc_text[:begin] + '</span>' + conc_text[begin:end] + second_span + conc_text[end:] + '</span>'
    return conc_text
def fetch_collocation(results, path, q, db, word_filter=True, filter_num=100, full_report=True):
    within_x_words = q['word_num']    
    
    ## set up filtering of most frequent 200 terms ##
    filter_list = set([])
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        filter_words_file = open(filter_list_path)
        line_count = 0 
        for line in filter_words_file:
            line_count += 1
            word = line.split()[0]
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break
    
    ## start going though hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)
    
    count = 0
    if not full_report:
        q['colloc_start'] = None
        q['colloc_end'] = None
    for hit in results[q['colloc_start']:q['colloc_end']]:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        conc_text = unicodedata.normalize('NFC', conc_text)
        start_highlight = conc_text.find('<span class="highlight"')
        m = end_highlight_match.search(conc_text)
        end_highlight = m.end(len(m.groups()) - 1)
        conc_left = conc_text[:start_highlight]
        conc_right = conc_text[end_highlight:]
        
        left_words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right', db)
        
        for l_word in left_words:
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1 

        for r_word in right_words:
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1    

    if full_report:
        return dict(all_collocates), dict(left_collocates), dict(right_collocates)
    else:
        return sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)
def fetch_colloc_concordance(results, path, q, db, filter_words=100):
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = unicodedata.normalize('NFC', q['collocate'].decode('utf-8', 'ignore'))
    collocate_num = q['collocate_num']
    
    ## set up filtering of most frequent 200 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)

    line_count = 0
    filter_list = set([])

    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
                break
    
    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        #conc_text = unicodedata.normalize('NFC', conc_text)
        start_highlight = conc_text.find('<span class="highlight"')
        m = end_highlight_match.search(conc_text)
        end_highlight = m.end(len(m.groups()) - 1)
        conc_left = conc_text[:start_highlight]
        conc_right = conc_text[end_highlight:]
        if direction =='left':
            words = tokenize(conc_left, filter_list, within_x_words, direction, db)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction, db)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right', db))
        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)

        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break
    
    h = collocation_hitlist(new_hitlist, collocate_num)
    return h
Example #4
0
def fetch_kwic(results, path, q, byte_query, start, end, length=400):
    kwic_results = []
    shortest_biblio = 0

    for hit in results[start:end]:
        biblio = hit.articleAuthor + ", " + hit.head

        ## additional clean-up for titles
        biblio = " ".join(biblio.split())  ## maybe hackish, but it works

        get_query = byte_query(hit.bytes)
        href = "./" + "/".join([str(i) for i in hit.philo_id[:4]]) + get_query

        ## Find shortest bibliography entry
        biblio = biblio
        if shortest_biblio == 0:
            shortest_biblio = len(biblio)
        if len(biblio) < shortest_biblio:
            shortest_biblio = len(biblio)

        ## Determine length of text needed
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        length = 200 + byte_distance + 200

        ## Get concordance and align it
        bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = KWIC_formatter(conc_text, len(hit.bytes))
        kwic_results.append((biblio, href, conc_text, hit))

    if shortest_biblio < 20:
        shortest_biblio = 20

    ## Populate Kwic_results with bibliography
    for pos, result in enumerate(kwic_results):
        biblio, href, text, hit = result
        if len(biblio) < 20:
            diff = 20 - len(biblio)
            biblio += " " * diff
        short_biblio = '<span id="short_biblio" style="white-space:pre-wrap;">%s</span>' % biblio[:shortest_biblio]
        full_biblio = '<span id="full_biblio" style="display:none;">%s</span>' % biblio
        kwic_biblio = full_biblio + short_biblio
        kwic_biblio_link = (
            '<a href="%s" class="kwic_biblio" style="white-space:pre-wrap;">' % href + kwic_biblio + "</a>: "
        )
        kwic_results[pos] = kwic_biblio_link + '<span id="kwic_text">%s</span>' % text
    return kwic_results
Example #5
0
def fetch_relevance(hit, path, q, samples=10):
    length = 75
    text_snippet = []
    hit_num = len(hit.bytes)
    if hit_num < samples:
        byte_sample = sorted(sample(hit.bytes, hit_num))
    else:
        byte_sample = sorted(sample(hit.bytes, samples))
    if hit_num and hit_num < samples:
        length = int(length * samples / hit_num)
    for byte in byte_sample: 
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        text_snippet.append(conc_text)
    text = ' ... '.join(text_snippet)
    return text
Example #6
0
def fetch_kwic(results, path, q, byte_query, db, start, end, length=5000):
    kwic_results = []
    
    default_short_citation_len = 30
    short_citation_len = 0
    for hit in results[start:end]:
        full_citation, short_citation, href = f.kwic_citation(db, hit, default_short_citation_len)
        
        ## Find longest short_citation
        if short_citation_len == 0:
            short_citation_len = len(short_citation)
        elif len(short_citation) > short_citation_len:
            short_citation_len = len(short_citation)
            
        ## Determine length of text needed
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        length = length/2 + byte_distance + length/2
            
        ## Get concordance and align it
        bytes, byte_start = adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = KWIC_formatter(conc_text, len(hit.bytes))
        kwic_results.append((full_citation, short_citation, href, conc_text, hit))
    
    if short_citation_len < default_short_citation_len:
        default_short_citation_len = short_citation_len
    
    ## Populate Kwic_results with bibliography    
    for pos, result in enumerate(kwic_results):
        biblio, short_biblio, href, text, hit = result
        if len(short_biblio) < default_short_citation_len:
            diff = default_short_citation_len - len(short_biblio)
            short_biblio += '&nbsp;' * diff
        short_biblio = '<span class="short_biblio">%s</span>' % short_biblio
        full_biblio = '<span class="full_biblio" style="display:none;">%s</span>' % biblio
        kwic_biblio = full_biblio + short_biblio
        if q['format'] == "json":
            kwic_results[pos] = (kwic_biblio, text, hit.philo_id)
        else:
            kwic_biblio_link = '<a href="%s" class="kwic_biblio">' % href + kwic_biblio + '</a>: '
            kwic_results[pos] = kwic_biblio_link + '%s' % text
    return kwic_results
Example #7
0
def generate_kwic_results(db, q, config, link_to_hit="div1"):
    """ The link_to_hit keyword defines the text object to which the metadata link leads to"""
    hits = db.query(q["q"],q["method"],q["arg"],**q.metadata)
    start, end, n = f.link.page_interval(q.results_per_page, hits, q.start, q.end)
    kwic_object = {"description": {"start": start, "end": end, "results_per_page": q.results_per_page},
                    "query": dict([i for i in q])}
    kwic_results = []
    
    length = config.concordance_length
    
    for hit in hits[start - 1:end]:
        # Get all metadata
        metadata_fields = {}
        for metadata in db.locals['metadata_fields']:
            metadata_fields[metadata] = hit[metadata].strip()
        
        ## Get all links and citations
        citation_hrefs = citation_links(db, config, hit)
        citation = concordance_citation(hit, citation_hrefs)
            
        ## Determine length of text needed
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        length = config.concordance_length + byte_distance + config.concordance_length
            
        ## Get concordance and align it
        bytes, byte_start = adjust_bytes(hit.bytes, config.concordance_length)
        conc_text = f.get_text(hit, byte_start, length, config.db_path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = KWIC_formatter(conc_text, len(hit.bytes))

        kwic_result = {"philo_id": hit.philo_id, "context": conc_text, "metadata_fields": metadata_fields,
                       "citation_links": citation_hrefs, "citation": citation, "bytes": hit.bytes}
        kwic_results.append(kwic_result)
    kwic_object['results'] = kwic_results
    kwic_object['results_length'] = len(hits)
    kwic_object["query_done"] = hits.done
    
    return kwic_object
def fetch_relevance(hit, path, q, samples=10):
    length = 75
    text_snippet = []
    hit_num = len(hit.bytes)
    if hit_num < samples:
        byte_sample = sorted(sample(hit.bytes, hit_num))
    else:
        byte_sample = sorted(sample(hit.bytes, samples))
    if hit_num and hit_num < samples:
        length = int(length * samples / hit_num)
    for byte in byte_sample: 
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        #conc_text = re.sub('<(/?span.*?)>', '[\\1]', conc_text)
        #conc_text = re.sub('<.*?>', '', conc_text)
        #conc_text = re.sub('\[(/?span.*?)\]', '<\\1>', conc_text)
        #conc_text = re.sub('<div[^>]*>', '', conc_text)
        #conc_text = re.sub('</div>', '', conc_text)
        text_snippet.append(conc_text)
    text = ' ... '.join(text_snippet)
    return text