def fetch_concordance(hit, path, q):
    ## Determine length of text needed
    byte_distance = hit.bytes[-1] - hit.bytes[0]
    length = 1000 + byte_distance + 1000
    bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_text = format_strip(conc_text, bytes)
    conc_text = convert_entities(conc_text)
    start_highlight = conc_text.find('<span class="highlight"')
    m = re.search(r'<span class="highlight">[^<]*(</span>)', conc_text)
    if m:
        end_highlight = m.end(len(bytes) - 1)
    count = 0
    for char in reversed(conc_text[:start_highlight]):
        count += 1
        if count > 200 and char == ' ':
            break
    begin = start_highlight - count
    end = 0
    for char in conc_text[end_highlight:]:
        end += 1
        if end > 200 and char == ' ':
            break
    end += end_highlight
    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    conc_text = first_span + conc_text[:begin] + '</span>' + conc_text[begin:end] + second_span + conc_text[end:] + '</span>'
    return conc_text
def fetch_relevance(hit, path, q, kwic=True, samples=3):
    length = 400
    text_snippet = []
    if len(hit.bytes) >= samples:
        byte_sample = sample(hit.bytes, samples)
    else:
        byte_sample = hit.bytes
    for byte in byte_sample:
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_start, conc_middle, conc_end = chunkifier(conc_text, bytes, highlight=True, kwic=kwic)
        conc_start = clean_text(conc_start, kwic=kwic)
        conc_end = clean_text(conc_end, kwic=kwic)
        if kwic:
            conc_middle = clean_text(conc_middle, notag=False, kwic=kwic)
            conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
            conc_text = align_text(conc_text, 1)
        else:
            conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
        text_snippet.append(conc_text)
    if kwic:
        text = '<br>\n'.join(text_snippet)
    else:
        text = '... '.join(text_snippet)
    return text
def fetch_collocation(results, path, q, filter_words=100):
    within_x_words = q['word_num']
    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break
    ## start going through hits ##
    left_collocates = {}
    right_collocates = {}
    all_collocates = {}
    count = 0
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)
        left_words = tokenize(conc_left, filter_list, within_x_words, 'left')
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right')
        for l_word in left_words:
            if l_word == q['q']:
                continue
            if l_word not in left_collocates:
                left_collocates[l_word] = 0
            left_collocates[l_word] += 1
            if l_word not in all_collocates:
                all_collocates[l_word] = 0
            all_collocates[l_word] += 1
        for r_word in right_words:
            if r_word == q['q']:
                continue
            if r_word not in right_collocates:
                right_collocates[r_word] = 0
            if r_word not in all_collocates:
                all_collocates[r_word] = 0
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1
    left_out = sorted(left_collocates.items(), key=lambda x: x[1], reverse=True)[:100]
    right_out = sorted(right_collocates.items(), key=lambda x: x[1], reverse=True)[:100]
    all_out = sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)[:100]
    tuple_out = zip(all_out, left_out, right_out)
    return tuple_out
def fetch_concordance(hit, path, context_size):
    ## Determine length of text needed
    byte_distance = hit.bytes[-1] - hit.bytes[0]
    length = context_size + byte_distance + context_size
    bytes, byte_start = adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_text = format_concordance(conc_text, bytes)
    conc_text = convert_entities(conc_text)
    conc_text = strip_start_punctuation.sub("", conc_text)
    return conc_text
def fetch_concordance(db, hit, path, context_size):
    ## Determine length of text needed
    bytes = sorted(hit.bytes)
    byte_distance = bytes[-1] - bytes[0]
    length = context_size + byte_distance + context_size
    bytes, byte_start = adjust_bytes(bytes, context_size)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_text = format_concordance(conc_text, db.locals['word_regex'], bytes)
    conc_text = convert_entities(conc_text)
    conc_text = strip_start_punctuation.sub("", conc_text)
    return conc_text
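# Illustrative sketch (not part of the PhiloLogic API above): every fetch_concordance
# variant follows the same pattern -- take the first and last hit offsets, widen the
# window by a fixed context size on each side, and slice that window out of the text.
# The helper below shows the pattern on a plain string; extract_window, offsets and
# context_size are hypothetical names used only for this example, with no assumptions
# about f.get_text or adjust_bytes.
def extract_window(text, offsets, context_size=20):
    """Return the substring covering all offsets plus context_size on each side."""
    offsets = sorted(offsets)
    start = max(offsets[0] - context_size, 0)
    end = min(offsets[-1] + context_size, len(text))
    return text[start:end]

# Example: extract_window("the quick brown fox jumps over the lazy dog", [10, 16], 6)
# returns "quick brown fox ju".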
def fetch_collocation(results, path, q, db, word_filter=True, filter_num=100, full_report=True):
    within_x_words = q['word_num']
    ## set up filtering of most frequent 100 terms ##
    filter_list = set([])
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        filter_words_file = open(filter_list_path)
        line_count = 0
        for line in filter_words_file:
            line_count += 1
            word = line.split()[0]
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break
    ## start going through hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)
    count = 0
    if not full_report:
        q['colloc_start'] = None
        q['colloc_end'] = None
    for hit in results[q['colloc_start']:q['colloc_end']]:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        conc_text = unicodedata.normalize('NFC', conc_text)
        start_highlight = conc_text.find('<span class="highlight"')
        m = end_highlight_match.search(conc_text)
        end_highlight = m.end(len(m.groups()) - 1)
        conc_left = conc_text[:start_highlight]
        conc_right = conc_text[end_highlight:]
        left_words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right', db)
        for l_word in left_words:
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1
        for r_word in right_words:
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1
    if full_report:
        return dict(all_collocates), dict(left_collocates), dict(right_collocates)
    else:
        return sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)
def fetch_collocation(results, path, q, word_filter=True, filter_num=200, full_report=True):
    within_x_words = q['word_num']
    ## set up filtering of most frequent 200 terms ##
    filter_list = set([])
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        filter_words_file = open(filter_list_path)
        line_count = 0
        for line in filter_words_file:
            line_count += 1
            word = line.split()[0]
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break
    ## start going through hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)
    count = 0
    if not full_report:
        q['colloc_start'] = None
        q['colloc_end'] = None
    for hit in results[q['colloc_start']:q['colloc_end']]:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)
        left_words = tokenize(conc_left, filter_list, within_x_words, 'left')
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right')
        query_words = set([w.decode('utf-8') for w in q['q'].split('|')])
        for l_word in left_words:
            if l_word in query_words:
                continue
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1
        for r_word in right_words:
            if r_word in query_words:
                continue
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1
    if full_report:
        return dict(all_collocates), dict(left_collocates), dict(right_collocates)
    else:
        return sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)
def fetch_colloc_concordance(results, path, q, db, filter_words=100):
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = unicodedata.normalize('NFC', q['collocate'].decode('utf-8', 'ignore'))
    collocate_num = q['collocate_num']
    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break
    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        #conc_text = unicodedata.normalize('NFC', conc_text)
        start_highlight = conc_text.find('<span class="highlight"')
        m = end_highlight_match.search(conc_text)
        end_highlight = m.end(len(m.groups()) - 1)
        conc_left = conc_text[:start_highlight]
        conc_right = conc_text[end_highlight:]
        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction, db)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction, db)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right', db))
        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)
        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break
    h = collocation_hitlist(new_hitlist, collocate_num)
    return h
def fetch_concordance(hit, path, q, length=2000):
    bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_start, conc_middle, conc_end = f.format.chunkifier(conc_text, bytes, highlight=True)
    conc_start = f.format.clean_text(conc_start)
    conc_end = f.format.clean_text(conc_end)
    conc_text = conc_start + conc_middle + conc_end
    conc_text = conc_text.decode('utf-8', 'ignore')
    highlight_index = conc_text.find('<span class="highlight"')
    begin = highlight_index - 200  ## make sure the highlighted term does not get hidden
    end = highlight_index + 200
    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    conc_text = first_span + conc_text[:begin] + '</span>' + conc_text[begin:end] + second_span + conc_text[end:] + '</span>'
    return conc_text
def fetch_relevance(hit, path, q, samples=10):
    length = 75
    text_snippet = []
    byte_sample = hit.bytes[:samples]
    for byte in byte_sample:
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_start, conc_middle, conc_end = chunkifier(conc_text, bytes, highlight=True)
        conc_start = clean_text(conc_start)
        conc_end = clean_text(conc_end)
        conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
        text_snippet.append(conc_text)
    text = ' ... '.join(text_snippet)
    return text
def fetch_kwic(results, path, q, byte_query, start, end, length=400):
    kwic_results = []
    shortest_biblio = 0
    for hit in results[start:end]:
        biblio = hit.articleAuthor + ", " + hit.head
        ## additional clean-up for titles
        biblio = " ".join(biblio.split())  ## maybe hackish, but it works
        get_query = byte_query(hit.bytes)
        href = "./" + "/".join([str(i) for i in hit.philo_id[:4]]) + get_query
        ## Find shortest bibliography entry
        biblio = biblio
        if shortest_biblio == 0:
            shortest_biblio = len(biblio)
        if len(biblio) < shortest_biblio:
            shortest_biblio = len(biblio)
        ## Determine length of text needed
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        length = 200 + byte_distance + 200
        ## Get concordance and align it
        bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = KWIC_formatter(conc_text, len(hit.bytes))
        kwic_results.append((biblio, href, conc_text, hit))
    if shortest_biblio < 20:
        shortest_biblio = 20
    ## Populate Kwic_results with bibliography
    for pos, result in enumerate(kwic_results):
        biblio, href, text, hit = result
        if len(biblio) < 20:
            diff = 20 - len(biblio)
            biblio += " " * diff
        short_biblio = '<span id="short_biblio" style="white-space:pre-wrap;">%s</span>' % biblio[:shortest_biblio]
        full_biblio = '<span id="full_biblio" style="display:none;">%s</span>' % biblio
        kwic_biblio = full_biblio + short_biblio
        kwic_biblio_link = '<a href="%s" class="kwic_biblio" style="white-space:pre-wrap;">' % href + kwic_biblio + "</a>: "
        kwic_results[pos] = kwic_biblio_link + '<span id="kwic_text">%s</span>' % text
    return kwic_results
def fetch_kwic(results, path, q, byte_query, start, end, length=400):
    kwic_results = []
    shortest_biblio = 0
    for hit in results[start:end]:
        biblio = hit.author + ', ' + hit.title
        ## additional clean-up for titles
        biblio = ' '.join(biblio.split())  ## maybe hackish, but it works
        get_query = byte_query(hit.bytes)
        href = "./" + '/'.join([str(i) for i in hit.philo_id[:5]]) + get_query
        ## Find shortest bibliography entry
        biblio = biblio
        if shortest_biblio == 0:
            shortest_biblio = len(biblio)
        if len(biblio) < shortest_biblio:
            shortest_biblio = len(biblio)
        ## Get concordance and align it
        bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_start, conc_middle, conc_end = f.format.chunkifier(conc_text, bytes, highlight=True, kwic=True)
        conc_start = f.format.clean_text(conc_start, kwic=True)
        conc_end = f.format.clean_text(conc_end, kwic=True)
        conc_middle = f.format.clean_text(conc_middle, notag=False, kwic=True)
        conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
        conc_text = f.format.align_text(conc_text, len(hit.bytes))
        kwic_results.append((biblio, href, conc_text, hit))
    ## Populate Kwic_results with bibliography
    for pos, result in enumerate(kwic_results):
        biblio, href, text, hit = result
        short_biblio = '<span id="short_biblio">%s</span>' % biblio[:shortest_biblio]
        end_biblio = '<span id="end_biblio" style="display:none;">%s</span>' % biblio[shortest_biblio:]
        full_biblio = short_biblio + end_biblio
        full_biblio = '<a href="%s" id="kwic_biblio" style="white-space:pre-wrap;">' % href + full_biblio + '</a>: '
        kwic_results[pos] = (full_biblio + text, hit)
    return kwic_results
def fetch_kwic(results, path, q, byte_query, start, end, length=400):
    kwic_results = []
    shortest_biblio = 0
    for hit in results[start:end]:
        biblio = hit.author + ', ' + hit.title
        ## additional clean-up for titles
        biblio = ' '.join(biblio.split())  ## maybe hackish, but it works
        get_query = byte_query(hit.bytes)
        href = "./" + '/'.join([str(i) for i in hit.philo_id[:4]]) + get_query
        ## Find shortest bibliography entry
        biblio = biblio
        if shortest_biblio == 0:
            shortest_biblio = len(biblio)
        if len(biblio) < shortest_biblio:
            shortest_biblio = len(biblio)
        ## Determine length of text needed
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        length = 200 + byte_distance + 200
        ## Get concordance and align it
        bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_start, conc_middle, conc_end = f.format.chunkifier(conc_text, bytes, highlight=True, kwic=True)
        conc_start = f.format.clean_text(conc_start, kwic=True)
        conc_end = f.format.clean_text(conc_end, kwic=True)
        conc_middle = f.format.clean_text(conc_middle, notag=False, kwic=True)
        conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
        conc_text = f.format.align_text(conc_text, len(hit.bytes))
        kwic_results.append((biblio, href, conc_text, hit))
    ## Populate Kwic_results with bibliography
    for pos, result in enumerate(kwic_results):
        biblio, href, text, hit = result
        short_biblio = '<span id="short_biblio">%s</span>' % biblio[:shortest_biblio]
        full_biblio = '<span id="full_biblio" style="display:none;">%s</span>' % biblio
        kwic_biblio = full_biblio + short_biblio
        kwic_biblio_link = '<a href="%s" class="kwic_biblio" style="white-space:pre-wrap;">' % href + kwic_biblio + '</a>: '
        kwic_results[pos] = kwic_biblio_link + '<span id="kwic_text">%s</span>' % text
    return kwic_results
def fetch_colloc_concordance(results, path, q, filter_words=100):
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = q['collocate'].decode('utf-8', 'ignore')
    collocate_num = q['collocate_num']
    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break
    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)
        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left')
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right'))
        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)
        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break
    return collocation_hitlist(new_hitlist, collocate_num)
def fetch_relevance(hit, path, q, samples=10):
    length = 75
    text_snippet = []
    hit_num = len(hit.bytes)
    if hit_num < samples:
        byte_sample = sorted(sample(hit.bytes, hit_num))
    else:
        byte_sample = sorted(sample(hit.bytes, samples))
    if hit_num and hit_num < samples:
        length = int(length * samples / hit_num)
    for byte in byte_sample:
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        text_snippet.append(conc_text)
    text = ' ... '.join(text_snippet)
    return text
def fetch_kwic(results, path, q, byte_query, db, start, end, length=5000):
    kwic_results = []
    default_short_citation_len = 30
    short_citation_len = 0
    for hit in results[start:end]:
        full_citation, short_citation, href = f.kwic_citation(db, hit, default_short_citation_len)
        ## Find longest short_citation
        if short_citation_len == 0:
            short_citation_len = len(short_citation)
        elif len(short_citation) > short_citation_len:
            short_citation_len = len(short_citation)
        ## Determine length of text needed
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        length = length/2 + byte_distance + length/2
        ## Get concordance and align it
        bytes, byte_start = adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = KWIC_formatter(conc_text, len(hit.bytes))
        kwic_results.append((full_citation, short_citation, href, conc_text, hit))
    if short_citation_len < default_short_citation_len:
        default_short_citation_len = short_citation_len
    ## Populate Kwic_results with bibliography
    for pos, result in enumerate(kwic_results):
        biblio, short_biblio, href, text, hit = result
        if len(short_biblio) < default_short_citation_len:
            diff = default_short_citation_len - len(short_biblio)
            short_biblio += ' ' * diff
        short_biblio = '<span class="short_biblio">%s</span>' % short_biblio
        full_biblio = '<span class="full_biblio" style="display:none;">%s</span>' % biblio
        kwic_biblio = full_biblio + short_biblio
        if q['format'] == "json":
            kwic_results[pos] = (kwic_biblio, text, hit.philo_id)
        else:
            kwic_biblio_link = '<a href="%s" class="kwic_biblio">' % href + kwic_biblio + '</a>: '
            kwic_results[pos] = kwic_biblio_link + '%s' % text
    return kwic_results
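# Illustrative sketch (separate from the functions above): the KWIC variants pad each
# short citation to a common width so that the keyword column lines up down the page.
# The helper below shows a simplified version of that alignment step on plain strings;
# pad_citations, minimum and target_len are hypothetical names for this example only.
def pad_citations(citations, minimum=30):
    """Right-pad each citation with spaces to the longest length (at least `minimum`)."""
    target_len = max([minimum] + [len(c) for c in citations])
    return [c + ' ' * (target_len - len(c)) for c in citations]

# Example: pad_citations(["Voltaire, Candide", "Diderot, Encyclopedie"], minimum=25)
# returns two 25-character strings, so the concordance text following each citation
# starts in the same column.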
def fetch_colloc_concordance(results, path, q, filter_words=100):
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = q['collocate']
    collocate_num = q['collocate_num']
    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break
    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)
        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left')
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right'))
        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)
    return collocation_hitlist(new_hitlist, collocate_num)
def generate_kwic_results(db, q, config, link_to_hit="div1"):
    """The link_to_hit keyword defines the text object to which the metadata link leads"""
    hits = db.query(q["q"], q["method"], q["arg"], **q.metadata)
    start, end, n = f.link.page_interval(q.results_per_page, hits, q.start, q.end)
    kwic_object = {"description": {"start": start, "end": end, "results_per_page": q.results_per_page},
                   "query": dict([i for i in q])}
    kwic_results = []
    length = config.concordance_length
    for hit in hits[start - 1:end]:
        # Get all metadata
        metadata_fields = {}
        for metadata in db.locals['metadata_fields']:
            metadata_fields[metadata] = hit[metadata].strip()
        ## Get all links and citations
        citation_hrefs = citation_links(db, config, hit)
        citation = concordance_citation(hit, citation_hrefs)
        ## Determine length of text needed
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        length = config.concordance_length + byte_distance + config.concordance_length
        ## Get concordance and align it
        bytes, byte_start = adjust_bytes(hit.bytes, config.concordance_length)
        conc_text = f.get_text(hit, byte_start, length, config.db_path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = KWIC_formatter(conc_text, len(hit.bytes))
        kwic_result = {"philo_id": hit.philo_id, "context": conc_text, "metadata_fields": metadata_fields,
                       "citation_links": citation_hrefs, "citation": citation, "bytes": hit.bytes}
        kwic_results.append(kwic_result)
    kwic_object['results'] = kwic_results
    kwic_object['results_length'] = len(hits)
    kwic_object["query_done"] = hits.done
    return kwic_object
def fetch_concordance(hit, path, q, length=2000):
    bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_start, conc_middle, conc_end = f.format.chunkifier(conc_text, bytes, highlight=True)
    conc_start = f.format.clean_text(conc_start)
    conc_end = f.format.clean_text(conc_end)
    conc_text = conc_start + conc_middle + conc_end
    conc_text = conc_text.decode('utf-8', 'ignore')
    highlight_index = conc_text.find('<span class="highlight"')
    begin = highlight_index - 200  ## make sure the highlighted term does not get hidden
    end = highlight_index + 200
    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    conc_text = first_span + conc_text[:begin] + '</span>' + conc_text[begin:end] + second_span + conc_text[end:] + '</span>'
    split_text = re.split(r"([^ \.,;:?!\"\n\r\t\(\)]+)|([\.;:?!])", conc_text)
    keep_text = []
    for w in split_text:
        if w:
            if w.lower() == q['collocate']:
                w = '<span class="collocate">%s</span>' % w
            keep_text.append(w)
    conc_text = ''.join(keep_text)
    return conc_text
def fetch_concordance(hit, path, q):
    ## Determine length of text needed
    byte_distance = hit.bytes[-1] - hit.bytes[0]
    length = 1000 + byte_distance + 1000
    bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_start, conc_middle, conc_end = f.format.chunkifier(conc_text, bytes, highlight=True)
    conc_start = f.format.clean_text(conc_start)
    conc_end = f.format.clean_text(conc_end)
    conc_text = conc_start + conc_middle + conc_end
    conc_text = conc_text.decode('utf-8', 'ignore')
    start_highlight = conc_text.find('<span class="highlight"')
    end_highlight = conc_text.rfind('</span>')
    begin = start_highlight - 200
    end = end_highlight + 200
    min = bytes[-1] + len("<span class='highlight'></span>") * len(bytes)
    if end < min:
        end = min
    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    conc_text = first_span + conc_text[:begin] + '</span>' + conc_text[begin:end] + second_span + conc_text[end:] + '</span>'
    return conc_text
def fetch_concordance(hit, path, q, length=2000):
    bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_start, conc_middle, conc_end = f.format.chunkifier(conc_text, bytes, highlight=True)
    conc_start = f.format.clean_text(conc_start)
    conc_end = f.format.clean_text(conc_end)
    conc_text = conc_start + conc_middle + conc_end
    conc_text = conc_text.decode('utf-8', 'ignore')
    highlight_index = conc_text.find('<span class="highlight"')
    begin = highlight_index - 200  ## make sure the highlighted term does not get hidden
    end = highlight_index + 200
    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    conc_text = first_span + conc_text[:begin] + '</span>' + conc_text[begin:end] + second_span + conc_text[end:] + '</span>'
    split_text = re.split(r"([^ \.,;:?!\"\n\r\t\(\)]+)|([\.;:?!])", conc_text)
    keep_text = []
    for w in split_text:
        if w:
            if w.lower() == q['collocate'].decode('utf-8', 'ignore'):
                w = '<span class="collocate">%s</span>' % w
            keep_text.append(w)
    conc_text = ''.join(keep_text)
    return conc_text
def fetch_relevance(hit, path, q, samples=10):
    length = 75
    text_snippet = []
    hit_num = len(hit.bytes)
    if hit_num < samples:
        byte_sample = sorted(sample(hit.bytes, hit_num))
    else:
        byte_sample = sorted(sample(hit.bytes, samples))
    if hit_num and hit_num < samples:
        length = int(length * samples / hit_num)
    for byte in byte_sample:
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        #conc_text = re.sub('<(/?span.*?)>', '[\\1]', conc_text)
        #conc_text = re.sub('<.*?>', '', conc_text)
        #conc_text = re.sub('\[(/?span.*?)\]', '<\\1>', conc_text)
        #conc_text = re.sub('<div[^>]*>', '', conc_text)
        #conc_text = re.sub('</div>', '', conc_text)
        text_snippet.append(conc_text)
    text = ' ... '.join(text_snippet)
    return text
def prepare_tfidf_vectorizer(corpus_file):
    corpus = get_text(corpus_file).splitlines()
    tf_idf = TfidfVectorizer(ngram_range=(1, 1), stop_words=load_stopwords())
    tf_idf.fit_transform(corpus)
    return tf_idf


def get_keywords(text, tfidf_vectorizer, score_limit=0.15):
    vector = tfidf_vectorizer.transform([text])
    words = tfidf_vectorizer.get_feature_names()
    tfidf_scores = vector.todense().tolist()[0]
    all_keywords = [(word, tfidf_scores[idx]) for idx, word in enumerate(words)]
    all_keywords = sorted(all_keywords, key=lambda x: x[1], reverse=True)
    top_keywords = list(filter(lambda x: x[1] >= score_limit, all_keywords))
    return top_keywords


hh_vectorizer = prepare_tfidf_vectorizer('lemmatized_skills100k.txt')
news_vectorizer = prepare_tfidf_vectorizer('russian_news_corpus/top100k.txt')

vacancy = get_text('test_vacancy.txt')
vacancy = preprocess_text(vacancy)
cv = get_text('test_cv.txt')
cv = preprocess_text(cv)

print(get_keywords(vacancy, hh_vectorizer, 0.15))
print(get_keywords(vacancy, news_vectorizer, 0.10))
print(get_keywords(cv, hh_vectorizer, 0.15))
print(get_keywords(cv, news_vectorizer, 0.10))
def fetch_collocation(results, path, q, db, word_filter=True, filter_num=100, full_report=True, stopwords=True):
    config = f.WebConfig()
    length = config['concordance_length']
    within_x_words = q['word_num']
    ## set up filtering with stopwords or 100 most frequent terms ##
    filter_list = set([q['q']])
    if word_filter:
        if stopwords:
            filter_list_path = path + '/data/stopwords.txt'
            if os.path.isfile(filter_list_path):
                filter_words_file = open(filter_list_path)
                filter_num = float("inf")
            else:
                filter_list_path = path + '/data/frequencies/word_frequencies'
                filter_words_file = open(filter_list_path)
        else:
            filter_list_path = path + '/data/frequencies/word_frequencies'
            filter_words_file = open(filter_list_path)
        line_count = 0
        for line in filter_words_file:
            line_count += 1
            try:
                word = line.split()[0]
            except IndexError:
                continue
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break
    ## start going through hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)
    count = 0
    for hit in results[q['interval_start']:q['interval_end']]:
        bytes, byte_start = adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        ## Isolate left and right concordances
        conc_left = convert_entities(conc_text[:bytes[0]].decode('utf-8', 'ignore'))
        conc_left = begin_match.sub('', conc_left)
        conc_left = start_cutoff_match.sub('', conc_left)
        conc_right = convert_entities(conc_text[bytes[-1]:].decode('utf-8', 'ignore'))
        conc_right = end_match.sub('', conc_right)
        conc_right = left_truncate.sub('', conc_right)
        conc_left = strip_tags(conc_left)
        conc_right = strip_tags(conc_right)
        left_words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right', db)
        for l_word in left_words:
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1
        for r_word in right_words:
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1
    if full_report:
        return all_collocates, left_collocates, right_collocates
    else:
        return all_collocates
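# Illustrative sketch (separate from the PhiloLogic functions above): the core of the
# fetch_collocation variants is counting, for each hit, the words that fall within N
# words to the left and right of the keyword, minus a stop list. The helper below
# shows that counting step on a plain pre-tokenized token list; count_collocates and
# its parameters are hypothetical names introduced only for this example.
from collections import defaultdict

def count_collocates(tokens, keyword, span=5, stoplist=frozenset()):
    """Count tokens appearing within `span` words of each occurrence of `keyword`."""
    counts = defaultdict(int)
    for i, token in enumerate(tokens):
        if token != keyword:
            continue
        window = tokens[max(i - span, 0):i] + tokens[i + 1:i + 1 + span]
        for neighbor in window:
            if neighbor != keyword and neighbor not in stoplist:
                counts[neighbor] += 1
    return dict(counts)

# Example: count_collocates("the cat sat on the mat near the cat".split(), "cat", span=2)
# returns {'the': 2, 'sat': 1, 'on': 1, 'near': 1}.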
def adjust_results(hits, path, q, length=2000):
    front_of_clause = 35
    end_of_clause = 90
    word = q['q']
    punctuation = re.compile('([,|?|;|.|:|!])')
    new_results = []
    full_report = {}
    for hit in hits:
        bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        hit.concordance = theme_rheme_concordance(conc_text, bytes)
        conc_start = conc_text[:bytes[0]]
        clause_start = punctuation.split(conc_start)[-1]  # keep only last bit
        conc_end = conc_text[bytes[0]:]
        clause_end = punctuation.split(conc_end)[0]  # keep only first bit
        clause = f.format.clean_text(clause_start + clause_end)
        new_clause = [i for i in clause.split() if len(i) > 2 or i.lower() == word]
        if len(new_clause) < 3:
            continue
        word_position = 0
        for pos, w in enumerate(new_clause):
            if w.lower() == word:
                word_position = pos + 1
                break
        clause_len = len(new_clause)
        percentage = round(word_position / clause_len * 100, 2)
        if q['theme_rheme'] == 'front' and percentage <= front_of_clause:
            hit.percentage = str(percentage) + '%'
            hit.score = str(word_position) + '/' + str(clause_len)
            hit.position = 'Front'
            new_results.append(hit)
        elif q['theme_rheme'] == 'end' and percentage >= end_of_clause:
            hit.percentage = str(percentage) + '%'
            hit.score = str(word_position) + '/' + str(clause_len)
            hit.position = 'End'
            new_results.append(hit)
        elif q['theme_rheme'] == 'front_end':
            if percentage <= front_of_clause:
                hit.position = 'Front'
                hit.percentage = str(percentage) + '%'
                hit.score = str(word_position) + '/' + str(clause_len)
                new_results.append(hit)
            elif percentage >= end_of_clause:
                hit.position = 'End'
                hit.percentage = str(percentage) + '%'
                hit.score = str(word_position) + '/' + str(clause_len)
                new_results.append(hit)
        elif q['theme_rheme'] == 'front_middle_end':
            if percentage <= front_of_clause:
                hit.position = 'Front'
                hit.percentage = str(percentage) + '%'
                hit.score = str(word_position) + '/' + str(clause_len)
                new_results.append(hit)
            elif front_of_clause < percentage < end_of_clause:
                hit.position = 'Middle'
                hit.percentage = str(percentage) + '%'
                hit.score = str(word_position) + '/' + str(clause_len)
                new_results.append(hit)
            elif percentage >= end_of_clause:
                hit.position = 'End'
                hit.percentage = str(percentage) + '%'
                hit.score = str(word_position) + '/' + str(clause_len)
                new_results.append(hit)
        elif q['theme_rheme'] == 'full':
            if percentage <= front_of_clause:
                hit.position = 'Front'
                hit.percentage = str(percentage) + '%'
                hit.score = str(word_position) + '/' + str(clause_len)
                new_results.append(hit)
                if 'Front' not in full_report:
                    full_report['Front'] = 0
                full_report['Front'] += 1
            elif front_of_clause < percentage < end_of_clause:
                hit.position = 'Middle'
                hit.percentage = str(percentage) + '%'
                hit.score = str(word_position) + '/' + str(clause_len)
                new_results.append(hit)
                if 'Middle' not in full_report:
                    full_report['Middle'] = 0
                full_report['Middle'] += 1
            elif percentage >= end_of_clause:
                hit.position = 'End'
                hit.percentage = str(percentage) + '%'
                hit.score = str(word_position) + '/' + str(clause_len)
                new_results.append(hit)
                if 'End' not in full_report:
                    full_report['End'] = 0
                full_report['End'] += 1
    return theme_rheme_hitlist(new_results), full_report
from functions import write_doc, get_text
import PySimpleGUI as gui

print(gui.version)

if __name__ == "__main__":
    doc_file = gui.PopupGetFile(title='Find document', message='Choose document path')
    # If the user chose a document and didn't just hit 'ok' this will then ask the user to enter the title of the docx
    # file where the bold terms will be copied to.
    if doc_file != "":
        file_name = gui.PopupGetText('Enter the name of the file for bold terms to be copied to')
        ok_message = gui.Popup(custom_text='Ok', button_type=gui.Ok())
    else:
        error_message = gui.Popup(title="Error", custom_text='No Path Chosen', button_type=gui.Ok())
    text_list = get_text(doc_file)
    write_doc(text_list, file_name)
sock_2.listen(1)
conn_2, addr_2 = sock_2.accept()
length_hamming_word_input = functions.bin_dec(list(conn_2.recv(1024)))
conn_2.close()

# Receive the encoding of the source text
sock_3 = socket.socket()
#sock_3.bind(('192.168.0.103', 49101))
sock_3.bind(('127.0.0.1', 49101))
sock_3.listen(1)
conn_3, adrr_3 = sock_3.accept()
encoding = functions.get_ascii_code((conn_3.recv(1024)), 16)
encoding = functions.get_text(encoding, 16, 'utf8')
conn_3.close()

# Receive the word length used for encoding
sock_4 = socket.socket()
#sock_4.bind(('192.168.0.103', 49102))
sock_4.bind(('127.0.0.1', 49102))
sock_4.listen(1)
conn_4, adrr_4 = sock_4.accept()
word_length = functions.bin_dec(list(conn_4.recv(1024)))

# Part 2. Print the received data to the screen
print("Hamming_mas = ")
print(Hamming_mas)
def fetch_colloc_concordance(results, path, q, db, config, word_filter=True, filter_num=100, stopwords=True):
    length = config['concordance_length']
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = unicodedata.normalize('NFC', q['collocate'].decode('utf-8', 'ignore'))
    collocate_num = q['collocate_num']
    ## set up filtering with stopwords or 100 most frequent terms ##
    filter_list = set([q['q']])
    if word_filter:
        if stopwords:
            filter_list_path = path + '/data/stopwords.txt'
            if os.path.isfile(filter_list_path):
                filter_words_file = open(filter_list_path)
                filter_num = float("inf")
            else:
                filter_list_path = path + '/data/frequencies/word_frequencies'
                filter_words_file = open(filter_list_path)
        else:
            filter_list_path = path + '/data/frequencies/word_frequencies'
            filter_words_file = open(filter_list_path)
        line_count = 0
        for line in filter_words_file:
            line_count += 1
            word = line.split()[0]
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break
    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        ## Isolate left and right concordances
        conc_left = convert_entities(conc_text[:bytes[0]].decode('utf-8', 'ignore'))
        conc_left = begin_match.sub('', conc_left)
        conc_left = start_cutoff_match.sub('', conc_left)
        conc_right = convert_entities(conc_text[bytes[-1]:].decode('utf-8', 'ignore'))
        conc_right = end_match.sub('', conc_right)
        conc_right = left_truncate.sub('', conc_right)
        conc_left = strip_tags(conc_left)
        conc_right = strip_tags(conc_right)
        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction, db)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction, db)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right', db))
        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)
        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break
    h = collocation_hitlist(new_hitlist, collocate_num)
    return h
def prepare_tfidf_vectorizer(corpus_file):
    corpus = get_text(corpus_file).splitlines()
    tf_idf = TfidfVectorizer(ngram_range=(1, 1), stop_words=load_stopwords())
    tf_idf.fit_transform(corpus)
    return tf_idf
from functions import get_text, preprocess_text
from summa import keywords

vacancy = preprocess_text(get_text('test_vacancy.txt'))
cv = preprocess_text(get_text('test_cv.txt'))

print(keywords.keywords(vacancy, language="russian", scores=True))
print('====')
print(keywords.keywords(cv, language="russian"))
def fetch_collocation(results, path, q, filter_words=100):
    within_x_words = q['word_num']
    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break
    ## start going through hits ##
    left_collocates = {}
    right_collocates = {}
    all_collocates = {}
    count = 0
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)
        left_words = tokenize(conc_left, filter_list, within_x_words, 'left')
        if not sum([len(i) for i in left_words]):
            count += 1
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right')
        for l_word in left_words:
            if l_word == q['q']:
                continue
            if l_word not in left_collocates:
                left_collocates[l_word] = 0
            left_collocates[l_word] += 1
            if l_word not in all_collocates:
                all_collocates[l_word] = 0
            all_collocates[l_word] += 1
        for r_word in right_words:
            if r_word == q['q']:
                continue
            if r_word not in right_collocates:
                right_collocates[r_word] = 0
            if r_word not in all_collocates:
                all_collocates[r_word] = 0
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1
    left_out = sorted(left_collocates.items(), key=lambda x: x[1], reverse=True)
    right_out = sorted(right_collocates.items(), key=lambda x: x[1], reverse=True)
    all_out = sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)
    tuple_out = zip(all_out, left_out, right_out)
    print >> sys.stderr, "COUNT", count
    return tuple_out
            0] is "<" else area_of_expertise
        # Getting active department frequencies
        if department not in department_frequency.keys():
            department_frequency[department] = 1
        else:
            department_frequency[department] += 1
        # Adding areas of expertise frequencies
        if area_of_expertise not in areas_of_expertise_frequency.keys():
            areas_of_expertise_frequency[area_of_expertise] = 1
        else:
            areas_of_expertise_frequency[area_of_expertise] += 1
        # Read contents of the job page
        page_text = get_text(html_link)
        terms_in_link = NgramBuilder.ngramExtractor(NgramBuilder, page_text)
        no_of_keywords_in_link = len(terms_in_link)
        list_of_terms = list(terms_in_link)
        for j in range(0, no_of_keywords_in_link):
            if (list_of_terms[j][0] not in terms.keys()):
                terms[list_of_terms[j][0]] = 1
            else:
                terms[list_of_terms[j][0]] += 1

required_terms = [
    "information", "sharepoint", "drupal", "wordpress", "automation", "junior",
    "senior", "information technology", "ict", "agile", "scrum", "trello",
    "jira", "java", "python", "c", "golang", "cobol", "manager", "developer",
    "baseline", "security", "citizen", "citizenship", "police verification",
# Loops until the user decides to quit
while True:
    try:
        if first_time:
            action = '0'
            first_time = False
        else:
            # Gives the user a menu of actions to choose from
            action = functions.get_action()
            # Reloads the functions file if it's been updated
            mod_time = functions.update_functions(functions, mod_time)
        # Executes the appropriate feature depending on the action choice
        if action == '0':
            # Lets the user choose a new text file
            text, filename = functions.get_text()
        elif action == '1':
            # Counts how many times a word or phrase appears
            word = functions.get_word("Please enter a word or phrase to "
                                      "find in the text: ")
            functions.print_frequency(text, word)
        elif action == '2':
            # Prints all instances of a word or phrase
            word = functions.get_word("Please enter a word or phrase to "
                                      "find in the text: ")
            functions.print_instances(text, word)
        elif action == '3':
            # Replaces a word or phrase with another word or phrase
            old_word = functions.get_word("Please enter a word or phrase to "
                                          "find in the text: ")
            new_word = functions.get_word("Please enter a word or phrase to "
from functions import get_text, preprocess_text

text = get_text('skills100k.txt')
clear_text = preprocess_text(text, True)

with open("lemmatized_skills100k.txt", "w") as outfile:
    outfile.write(clear_text)