## Standard-library imports used below; project-local helpers (f, adjust_bytes,
## format_strip, convert_entities, tokenize, KWIC_formatter, end_highlight_match,
## collocation_hitlist, citation_links, concordance_citation) are assumed to be
## imported elsewhere in this module.
import re
import unicodedata
from collections import defaultdict
from random import sample


def fetch_concordance(hit, path, q):
    ## Determine length of text needed
    byte_distance = hit.bytes[-1] - hit.bytes[0]
    length = 1000 + byte_distance + 1000
    bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_text = format_strip(conc_text, bytes)
    conc_text = convert_entities(conc_text)

    ## Locate the highlighted hit inside the retrieved text
    start_highlight = conc_text.find('<span class="highlight"')
    end_highlight = start_highlight  ## fallback in case no closing tag is found
    m = re.search(r'<span class="highlight">[^<]*(</span>)', conc_text)
    if m:
        end_highlight = m.end(len(bytes) - 1)

    ## Walk backwards from the highlight until a space past 200 characters
    count = 0
    for char in reversed(conc_text[:start_highlight]):
        count += 1
        if count > 200 and char == ' ':
            break
    begin = start_highlight - count

    ## Walk forwards from the highlight until a space past 200 characters
    end = 0
    for char in conc_text[end_highlight:]:
        end += 1
        if end > 200 and char == ' ':
            break
    end += end_highlight

    ## Wrap the trimmed-off context in hidden spans so it can be expanded client-side
    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    conc_text = (first_span + conc_text[:begin] + '</span>'
                 + conc_text[begin:end]
                 + second_span + conc_text[end:] + '</span>')
    return conc_text
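## Illustrative sketch only (not part of the original module): the word-boundary
## search used twice inside fetch_concordance above, extracted into a standalone
## helper. The name and signature are hypothetical.
def _find_word_boundary(text, anchor, min_chars=200, backward=False):
    """Walk away from `anchor` until at least `min_chars` characters have been
    passed and a space is reached, so the concordance cut lands between words."""
    count = 0
    chars = reversed(text[:anchor]) if backward else text[anchor:]
    for char in chars:
        count += 1
        if count > min_chars and char == ' ':
            break
    return anchor - count if backward else anchor + count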
def fetch_collocation(results, path, q, db, word_filter=True, filter_num=100, full_report=True):
    within_x_words = q['word_num']

    ## Set up filtering of the most frequent `filter_num` terms ##
    filter_list = set([])
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        filter_words_file = open(filter_list_path)
        line_count = 0
        for line in filter_words_file:
            line_count += 1
            word = line.split()[0]
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break

    ## Start going through hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)

    if not full_report:
        q['colloc_start'] = None
        q['colloc_end'] = None
    for hit in results[q['colloc_start']:q['colloc_end']]:
        ## Get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        conc_text = unicodedata.normalize('NFC', conc_text)

        ## Split the text into left and right context around the highlighted hit
        start_highlight = conc_text.find('<span class="highlight"')
        m = end_highlight_match.search(conc_text)
        end_highlight = m.end(len(m.groups()) - 1)
        conc_left = conc_text[:start_highlight]
        conc_right = conc_text[end_highlight:]

        left_words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right', db)

        for l_word in left_words:
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1
        for r_word in right_words:
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1

    if full_report:
        return dict(all_collocates), dict(left_collocates), dict(right_collocates)
    else:
        return sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)
def fetch_colloc_concordance(results, path, q, db, filter_words=100):
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = unicodedata.normalize('NFC', q['collocate'].decode('utf-8', 'ignore'))
    collocate_num = q['collocate_num']

    ## Set up filtering of the most frequent `filter_words` terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break

    new_hitlist = []
    for hit in results:
        ## Get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        #conc_text = unicodedata.normalize('NFC', conc_text)

        ## Split the text into left and right context around the highlighted hit
        start_highlight = conc_text.find('<span class="highlight"')
        m = end_highlight_match.search(conc_text)
        end_highlight = m.end(len(m.groups()) - 1)
        conc_left = conc_text[:start_highlight]
        conc_right = conc_text[end_highlight:]

        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction, db)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction, db)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right', db))

        ## Keep only hits whose context actually contains the requested collocate
        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)

        ## Stop once enough hits have been collected for the requested page
        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break

    h = collocation_hitlist(new_hitlist, collocate_num)
    return h
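## A possible shared helper (not in the original source): fetch_collocation and
## fetch_colloc_concordance above both rebuild the most-frequent-word filter from
## data/frequencies/word_frequencies, so the duplicated loop could be factored out.
## The helper name is hypothetical.
def load_filter_list(path, filter_num=100):
    """Return the top `filter_num` words of the frequency file as a set of unicode strings."""
    filter_list = set([])
    filter_words_file = open(path + '/data/frequencies/word_frequencies')
    line_count = 0
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_num:
            break
    filter_words_file.close()
    return filter_list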
def fetch_kwic(results, path, q, byte_query, start, end, length=400):
    kwic_results = []
    shortest_biblio = 0
    for hit in results[start:end]:
        biblio = hit.articleAuthor + ", " + hit.head

        ## Additional clean-up for titles
        biblio = " ".join(biblio.split())  ## maybe hackish, but it works

        get_query = byte_query(hit.bytes)
        href = "./" + "/".join([str(i) for i in hit.philo_id[:4]]) + get_query

        ## Find the shortest bibliography entry
        if shortest_biblio == 0:
            shortest_biblio = len(biblio)
        if len(biblio) < shortest_biblio:
            shortest_biblio = len(biblio)

        ## Determine length of text needed
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        length = 200 + byte_distance + 200

        ## Get concordance and align it
        bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = KWIC_formatter(conc_text, len(hit.bytes))
        kwic_results.append((biblio, href, conc_text, hit))

    if shortest_biblio < 20:
        shortest_biblio = 20

    ## Populate kwic_results with the bibliography
    for pos, result in enumerate(kwic_results):
        biblio, href, text, hit = result
        if len(biblio) < 20:
            diff = 20 - len(biblio)
            biblio += " " * diff
        short_biblio = '<span id="short_biblio" style="white-space:pre-wrap;">%s</span>' % biblio[:shortest_biblio]
        full_biblio = '<span id="full_biblio" style="display:none;">%s</span>' % biblio
        kwic_biblio = full_biblio + short_biblio
        kwic_biblio_link = ('<a href="%s" class="kwic_biblio" style="white-space:pre-wrap;">' % href
                            + kwic_biblio + "</a>: ")
        kwic_results[pos] = kwic_biblio_link + '<span id="kwic_text">%s</span>' % text
    return kwic_results
def fetch_relevance(hit, path, q, samples=10):
    length = 75
    text_snippet = []
    hit_num = len(hit.bytes)
    if hit_num < samples:
        byte_sample = sorted(sample(hit.bytes, hit_num))
    else:
        byte_sample = sorted(sample(hit.bytes, samples))
    if hit_num and hit_num < samples:
        length = int(length * samples / hit_num)
    for byte in byte_sample:
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        text_snippet.append(conc_text)
    text = ' ... '.join(text_snippet)
    return text
def fetch_kwic(results, path, q, byte_query, db, start, end, length=5000):
    kwic_results = []
    default_short_citation_len = 30
    short_citation_len = 0
    for hit in results[start:end]:
        full_citation, short_citation, href = f.kwic_citation(db, hit, default_short_citation_len)

        ## Find longest short_citation
        if short_citation_len == 0:
            short_citation_len = len(short_citation)
        elif len(short_citation) > short_citation_len:
            short_citation_len = len(short_citation)

        ## Determine length of text needed for this hit (kept in a local variable
        ## so the requested `length` is not compounded from one hit to the next)
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        text_length = length / 2 + byte_distance + length / 2

        ## Get concordance and align it
        bytes, byte_start = adjust_bytes(hit.bytes, text_length)
        conc_text = f.get_text(hit, byte_start, text_length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = KWIC_formatter(conc_text, len(hit.bytes))
        kwic_results.append((full_citation, short_citation, href, conc_text, hit))

    if short_citation_len < default_short_citation_len:
        default_short_citation_len = short_citation_len

    ## Populate kwic_results with the bibliography
    for pos, result in enumerate(kwic_results):
        biblio, short_biblio, href, text, hit = result
        if len(short_biblio) < default_short_citation_len:
            diff = default_short_citation_len - len(short_biblio)
            short_biblio += ' ' * diff
        short_biblio = '<span class="short_biblio">%s</span>' % short_biblio
        full_biblio = '<span class="full_biblio" style="display:none;">%s</span>' % biblio
        kwic_biblio = full_biblio + short_biblio
        if q['format'] == "json":
            kwic_results[pos] = (kwic_biblio, text, hit.philo_id)
        else:
            kwic_biblio_link = '<a href="%s" class="kwic_biblio">' % href + kwic_biblio + '</a>: '
            kwic_results[pos] = kwic_biblio_link + '%s' % text
    return kwic_results
def generate_kwic_results(db, q, config, link_to_hit="div1"):
    """The link_to_hit keyword defines the text object to which the metadata link leads."""
    hits = db.query(q["q"], q["method"], q["arg"], **q.metadata)
    start, end, n = f.link.page_interval(q.results_per_page, hits, q.start, q.end)
    kwic_object = {"description": {"start": start, "end": end, "results_per_page": q.results_per_page},
                   "query": dict([i for i in q])}
    kwic_results = []
    length = config.concordance_length
    for hit in hits[start - 1:end]:
        # Get all metadata
        metadata_fields = {}
        for metadata in db.locals['metadata_fields']:
            metadata_fields[metadata] = hit[metadata].strip()

        ## Get all links and citations
        citation_hrefs = citation_links(db, config, hit)
        citation = concordance_citation(hit, citation_hrefs)

        ## Determine length of text needed
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        length = config.concordance_length + byte_distance + config.concordance_length

        ## Get concordance and align it
        bytes, byte_start = adjust_bytes(hit.bytes, config.concordance_length)
        conc_text = f.get_text(hit, byte_start, length, config.db_path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = KWIC_formatter(conc_text, len(hit.bytes))

        kwic_result = {"philo_id": hit.philo_id, "context": conc_text, "metadata_fields": metadata_fields,
                       "citation_links": citation_hrefs, "citation": citation, "bytes": hit.bytes}
        kwic_results.append(kwic_result)

    kwic_object['results'] = kwic_results
    kwic_object['results_length'] = len(hits)
    kwic_object["query_done"] = hits.done
    return kwic_object
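## Hypothetical usage sketch (not from the original source): generate_kwic_results
## returns a plain dict, so a JSON endpoint could serialize it directly. The
## handler name is an assumption for illustration.
import json

def kwic_json(db, q, config):
    kwic_object = generate_kwic_results(db, q, config)
    return json.dumps(kwic_object)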
def fetch_relevance(hit, path, q, samples=10):
    length = 75
    text_snippet = []
    hit_num = len(hit.bytes)
    if hit_num < samples:
        byte_sample = sorted(sample(hit.bytes, hit_num))
    else:
        byte_sample = sorted(sample(hit.bytes, samples))
    if hit_num and hit_num < samples:
        length = int(length * samples / hit_num)
    for byte in byte_sample:
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        #conc_text = re.sub('<(/?span.*?)>', '[\\1]', conc_text)
        #conc_text = re.sub('<.*?>', '', conc_text)
        #conc_text = re.sub('\[(/?span.*?)\]', '<\\1>', conc_text)
        #conc_text = re.sub('<div[^>]*>', '', conc_text)
        #conc_text = re.sub('</div>', '', conc_text)
        text_snippet.append(conc_text)
    text = ' ... '.join(text_snippet)
    return text