def fetch_concordance(hit, path, q):
    ## Determine length of text needed
    byte_distance = hit.bytes[-1] - hit.bytes[0]
    length = 750 + byte_distance + 750
    bytes, byte_start = adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_text = format_strip(conc_text, bytes)
    conc_text = convert_entities(conc_text)
    start_highlight = conc_text.find('<span class="highlight"')
    m = re.search(r'<span class="highlight">[^<]*?(</span>)', conc_text)
    if m:
        end_highlight = m.end(len(m.groups()) - 1)
    count = 0
    for char in reversed(conc_text[:start_highlight]):
        count += 1
        if count > 100 and char == ' ':
            break
    begin = start_highlight - count
    end = 0
    for char in conc_text[end_highlight:]:
        end += 1
        if end > 200 and char == ' ':
            break
    end += end_highlight
    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    conc_text = first_span + conc_text[:begin] + '</span>' + conc_text[begin:end] + second_span + conc_text[end:] + '</span>'
    return conc_text
def lookup_word_service(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(db, environ)
    cursor = db.dbh.cursor()
    if request.report == "concordance":
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        context_size = config["concordance_length"] * 3
        hit = hits[int(request.position)]
        bytes = hit.bytes
        hit_span = hit.bytes[-1] - hit.bytes[0]
        length = context_size + hit_span + context_size
        bytes, byte_start = adjust_bytes(bytes, length)
        byte_end = byte_start + length
        filename = hit.filename
        token = request.selected
    elif request.report == "navigation":
        token = request.selected
        philo_id = request.philo_id.split(" ")
        text_obj = db[philo_id]
        byte_start, byte_end = int(text_obj.byte_start), int(text_obj.byte_end)
        filename = text_obj.filename
        # print >> sys.stderr, "WORD LOOKUP FROM NAVIGATION", request.philo_id, request.selected, byte_start, byte_end, filename
    else:
        pass
    # print >> sys.stderr, "TOKEN", token, "BYTES: ", byte_start, byte_end, "FILENAME: ", filename, "POSITION", request.position
    token_n = 0
    yield lookup_word(db, cursor, token, token_n, byte_start, byte_end, filename)
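## Usage sketch (not part of the original source): lookup_word_service follows the
## standard WSGI calling convention (environ, start_response) and yields its JSON body,
## so for local testing it could be mounted with wsgiref. The host and port below are
## arbitrary placeholders; a production install would serve it through the web server's
## own WSGI wiring instead.
if __name__ == "__main__":
    from wsgiref.simple_server import make_server
    make_server("localhost", 8080, lookup_word_service).serve_forever()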
def fetch_concordance(hit, path, context_size):
    ## Determine length of text needed
    byte_distance = hit.bytes[-1] - hit.bytes[0]
    length = context_size + byte_distance + context_size
    bytes, byte_start = adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_text = format_concordance(conc_text, bytes)
    conc_text = convert_entities(conc_text)
    conc_text = strip_start_punctuation.sub("", conc_text)
    return conc_text
def fetch_concordance(db, hit, path, context_size):
    ## Determine length of text needed
    bytes = sorted(hit.bytes)
    byte_distance = bytes[-1] - bytes[0]
    length = context_size + byte_distance + context_size
    bytes, byte_start = adjust_bytes(bytes, context_size)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_text = format_concordance(conc_text, db.locals['word_regex'], bytes)
    conc_text = convert_entities(conc_text)
    conc_text = strip_start_punctuation.sub("", conc_text)
    return conc_text
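## All of the fetchers above call adjust_bytes() before f.get_text(): it converts the
## hit's absolute byte offsets into offsets relative to the text window that will be
## read from disk, and returns the absolute start of that window. The helper below is
## only a sketch of that assumed contract (using the older calling convention, where
## the full window length is passed in), not the PhiloLogic implementation; note that
## the newest fetch_concordance above passes context_size instead of length.
def adjust_bytes_sketch(hit_bytes, length):
    """Return (window-relative hit offsets, absolute window start)."""
    hit_bytes = sorted(hit_bytes)
    byte_start = hit_bytes[0] - (length / 2)
    if byte_start < 0:
        byte_start = 0
    return [b - byte_start for b in hit_bytes], byte_start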
def fetch_collocation(results, path, q, db, word_filter=True, filter_num=100, full_report=True):
    within_x_words = q['word_num']
    ## set up filtering of the most frequent terms ##
    filter_list = set([])
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        filter_words_file = open(filter_list_path)
        line_count = 0
        for line in filter_words_file:
            line_count += 1
            word = line.split()[0]
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break
    ## start going through hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)
    count = 0
    if not full_report:
        q['colloc_start'] = None
        q['colloc_end'] = None
    for hit in results[q['colloc_start']:q['colloc_end']]:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        conc_text = unicodedata.normalize('NFC', conc_text)
        start_highlight = conc_text.find('<span class="highlight"')
        m = end_highlight_match.search(conc_text)
        end_highlight = m.end(len(m.groups()) - 1)
        conc_left = conc_text[:start_highlight]
        conc_right = conc_text[end_highlight:]
        left_words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right', db)
        for l_word in left_words:
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1
        for r_word in right_words:
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1
    if full_report:
        return dict(all_collocates), dict(left_collocates), dict(right_collocates)
    else:
        return sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)
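## fetch_collocation() and fetch_colloc_concordance() both lean on tokenize() to pick
## candidate collocates on each side of the hit. The function below is only a sketch of
## the assumed behaviour: lowercase, split on a word pattern, keep the within_x_words
## tokens nearest the hit, and drop anything in the filter list. The real helper takes
## the db handle (e.g. for db.locals['word_regex']); the word_regex default here is a
## placeholder.
def tokenize_sketch(text, filter_list, within_x_words, direction, word_regex=r'\w+'):
    words = [w.lower() for w in re.findall(word_regex, text, re.U)]
    if direction == 'left':
        words = words[-within_x_words:]   # tokens closest to the hit on its left
    else:
        words = words[:within_x_words]    # tokens closest to the hit on its right
    return [w for w in words if w not in filter_list]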
def fetch_colloc_concordance(results, path, q, db, filter_words=100):
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = unicodedata.normalize('NFC', q['collocate'].decode('utf-8', 'ignore'))
    collocate_num = q['collocate_num']
    ## set up filtering of the most frequent terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break
    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        #conc_text = unicodedata.normalize('NFC', conc_text)
        start_highlight = conc_text.find('<span class="highlight"')
        m = end_highlight_match.search(conc_text)
        end_highlight = m.end(len(m.groups()) - 1)
        conc_left = conc_text[:start_highlight]
        conc_right = conc_text[end_highlight:]
        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction, db)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction, db)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right', db))
        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)
        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break
    h = collocation_hitlist(new_hitlist, collocate_num)
    return h
def fetch_kwic(results, path, q, byte_query, db, start, end, length=500):
    kwic_results = []
    shortest_biblio = 0
    for hit in results[start:end]:
        biblio = hit.author + ', ' + hit.title
        ## additional clean-up for titles
        biblio = ' '.join(biblio.split())  ## maybe hackish, but it works
        get_query = byte_query(hit.bytes)
        href = "./" + '/'.join([str(i) for i in hit.philo_id[:2]]) + get_query
        ## Find shortest bibliography entry
        if shortest_biblio == 0:
            shortest_biblio = len(biblio)
        if len(biblio) < shortest_biblio:
            shortest_biblio = len(biblio)
        ## Determine length of text needed
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        hit_length = length / 2 + byte_distance + length / 2  # per-hit window; avoids compounding `length` across hits
        ## Get concordance and align it
        bytes, byte_start = adjust_bytes(hit.bytes, hit_length)
        conc_text = f.get_text(hit, byte_start, hit_length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = KWIC_formatter(conc_text, len(hit.bytes))
        kwic_results.append((biblio, href, conc_text, hit))
    if shortest_biblio < 20:
        shortest_biblio = 20
    ## Populate kwic_results with the bibliography
    for pos, result in enumerate(kwic_results):
        biblio, href, text, hit = result
        if len(biblio) < 20:
            diff = 20 - len(biblio)
            biblio += ' ' * diff
        short_biblio = '<span id="short_biblio" style="white-space:pre-wrap;">%s</span>' % biblio[:shortest_biblio]
        full_biblio = '<span id="full_biblio" style="display:none;">%s</span>' % biblio
        kwic_biblio = full_biblio + short_biblio
        kwic_biblio_link = '<a href="%s" class="kwic_biblio" style="white-space:pre-wrap;">' % href + kwic_biblio + '</a>: '
        kwic_results[pos] = kwic_biblio_link + '<span id="kwic_text">%s</span>' % text
    return kwic_results
def fetch_relevance(hit, path, q, samples=10):
    length = 75
    text_snippet = []
    hit_num = len(hit.bytes)
    if hit_num < samples:
        byte_sample = sorted(sample(hit.bytes, hit_num))
    else:
        byte_sample = sorted(sample(hit.bytes, samples))
    if hit_num and hit_num < samples:
        length = int(length * samples / hit_num)
    for byte in byte_sample:
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        text_snippet.append(conc_text)
    text = ' ... '.join(text_snippet)
    return text
def fetch_kwic(results, path, q, byte_query, db, start, end, length=5000):
    kwic_results = []
    default_short_citation_len = 30
    short_citation_len = 0
    for hit in results[start:end]:
        full_citation, short_citation, href = f.kwic_citation(db, hit, default_short_citation_len)
        ## Find longest short_citation
        if short_citation_len == 0:
            short_citation_len = len(short_citation)
        elif len(short_citation) > short_citation_len:
            short_citation_len = len(short_citation)
        ## Determine length of text needed
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        hit_length = length / 2 + byte_distance + length / 2  # per-hit window; avoids compounding `length` across hits
        ## Get concordance and align it
        bytes, byte_start = adjust_bytes(hit.bytes, hit_length)
        conc_text = f.get_text(hit, byte_start, hit_length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = KWIC_formatter(conc_text, len(hit.bytes))
        kwic_results.append((full_citation, short_citation, href, conc_text, hit))
    if short_citation_len < default_short_citation_len:
        default_short_citation_len = short_citation_len
    ## Populate kwic_results with the bibliography
    for pos, result in enumerate(kwic_results):
        biblio, short_biblio, href, text, hit = result
        if len(short_biblio) < default_short_citation_len:
            diff = default_short_citation_len - len(short_biblio)
            short_biblio += ' ' * diff
        short_biblio = '<span class="short_biblio">%s</span>' % short_biblio
        full_biblio = '<span class="full_biblio" style="display:none;">%s</span>' % biblio
        kwic_biblio = full_biblio + short_biblio
        if q['format'] == "json":
            kwic_results[pos] = (kwic_biblio, text, hit.philo_id)
        else:
            kwic_biblio_link = '<a href="%s" class="kwic_biblio">' % href + kwic_biblio + '</a>: '
            kwic_results[pos] = kwic_biblio_link + '%s' % text
    return kwic_results
def generate_kwic_results(db, q, config, link_to_hit="div1"):
    """The link_to_hit keyword defines the text object to which the metadata link points."""
    hits = db.query(q["q"], q["method"], q["arg"], **q.metadata)
    start, end, n = f.link.page_interval(q.results_per_page, hits, q.start, q.end)
    kwic_object = {"description": {"start": start, "end": end, "results_per_page": q.results_per_page},
                   "query": dict([i for i in q])}
    kwic_results = []
    length = config.concordance_length
    for hit in hits[start - 1:end]:
        # Get all metadata
        metadata_fields = {}
        for metadata in db.locals['metadata_fields']:
            metadata_fields[metadata] = hit[metadata].strip()
        ## Get all links and citations
        citation_hrefs = citation_links(db, config, hit)
        citation = concordance_citation(hit, citation_hrefs)
        ## Determine length of text needed
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        length = config.concordance_length + byte_distance + config.concordance_length
        ## Get concordance and align it
        bytes, byte_start = adjust_bytes(hit.bytes, config.concordance_length)
        conc_text = f.get_text(hit, byte_start, length, config.db_path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = KWIC_formatter(conc_text, len(hit.bytes))
        kwic_result = {"philo_id": hit.philo_id, "context": conc_text, "metadata_fields": metadata_fields,
                       "citation_links": citation_hrefs, "citation": citation, "bytes": hit.bytes}
        kwic_results.append(kwic_result)
    kwic_object['results'] = kwic_results
    kwic_object['results_length'] = len(hits)
    kwic_object["query_done"] = hits.done
    return kwic_object
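## The KWIC builders above delegate per-line alignment to KWIC_formatter(). The sketch
## below only illustrates the assumed idea behind that step, i.e. giving every row a
## fixed-width left context so the highlighted hits line up in a column; the real
## formatter also handles markup and multi-word hits, and its exact behaviour is not
## reproduced here.
def kwic_align_sketch(conc_text, width=40):
    """Pad or trim the text before the first highlight to a fixed width."""
    pos = conc_text.find('<span class="highlight"')
    if pos == -1:
        return conc_text
    left = ' '.join(conc_text[:pos].split())    # collapse whitespace in the left context
    right = ' '.join(conc_text[pos:].split())
    return left[-width:].rjust(width) + right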
def fetch_colloc_concordance(results, path, q, db, config, word_filter=True, filter_num=100, stopwords=True):
    length = config['concordance_length']
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = unicodedata.normalize('NFC', q['collocate'].decode('utf-8', 'ignore'))
    collocate_num = q['collocate_num']
    ## set up filtering with stopwords or 100 most frequent terms ##
    filter_list = set([q['q']])
    if word_filter:
        if stopwords:
            filter_list_path = path + '/data/stopwords.txt'
            if os.path.isfile(filter_list_path):
                filter_words_file = open(filter_list_path)
                filter_num = float("inf")
            else:
                filter_list_path = path + '/data/frequencies/word_frequencies'
                filter_words_file = open(filter_list_path)
        else:
            filter_list_path = path + '/data/frequencies/word_frequencies'
            filter_words_file = open(filter_list_path)
        line_count = 0
        for line in filter_words_file:
            line_count += 1
            word = line.split()[0]
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break
    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        ## Isolate left and right concordances
        conc_left = convert_entities(conc_text[:bytes[0]].decode('utf-8', 'ignore'))
        conc_left = begin_match.sub('', conc_left)
        conc_left = start_cutoff_match.sub('', conc_left)
        conc_right = convert_entities(conc_text[bytes[-1]:].decode('utf-8', 'ignore'))
        conc_right = end_match.sub('', conc_right)
        conc_right = left_truncate.sub('', conc_right)
        conc_left = strip_tags(conc_left)
        conc_right = strip_tags(conc_right)
        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction, db)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction, db)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right', db))
        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)
        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break
    h = collocation_hitlist(new_hitlist, collocate_num)
    return h
def fetch_collocation(results, path, q, db, word_filter=True, filter_num=100, full_report=True, stopwords=True):
    config = f.WebConfig()
    length = config['concordance_length']
    within_x_words = q['word_num']
    ## set up filtering with stopwords or 100 most frequent terms ##
    filter_list = set([q['q']])
    if word_filter:
        if stopwords:
            filter_list_path = path + '/data/stopwords.txt'
            if os.path.isfile(filter_list_path):
                filter_words_file = open(filter_list_path)
                filter_num = float("inf")
            else:
                filter_list_path = path + '/data/frequencies/word_frequencies'
                filter_words_file = open(filter_list_path)
        else:
            filter_list_path = path + '/data/frequencies/word_frequencies'
            filter_words_file = open(filter_list_path)
        line_count = 0
        for line in filter_words_file:
            line_count += 1
            try:
                word = line.split()[0]
            except IndexError:
                continue
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break
    ## start going through hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)
    count = 0
    for hit in results[q['interval_start']:q['interval_end']]:
        bytes, byte_start = adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        ## Isolate left and right concordances
        conc_left = convert_entities(conc_text[:bytes[0]].decode('utf-8', 'ignore'))
        conc_left = begin_match.sub('', conc_left)
        conc_left = start_cutoff_match.sub('', conc_left)
        conc_right = convert_entities(conc_text[bytes[-1]:].decode('utf-8', 'ignore'))
        conc_right = end_match.sub('', conc_right)
        conc_right = left_truncate.sub('', conc_right)
        conc_left = strip_tags(conc_left)
        conc_right = strip_tags(conc_right)
        left_words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right', db)
        for l_word in left_words:
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1
        for r_word in right_words:
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1
    if full_report:
        return all_collocates, left_collocates, right_collocates
    else:
        return all_collocates