def fetch_concordance(hit, path, q):
    ## Determine length of text needed
    byte_distance = hit.bytes[-1] - hit.bytes[0]
    length = 1000 + byte_distance + 1000
    bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_text = format_strip(conc_text, bytes)
    conc_text = convert_entities(conc_text)
    start_highlight = conc_text.find('<span class="highlight"')
    m = re.search(r'<span class="highlight">[^<]*(</span>)', conc_text)
    if m:
        end_highlight = m.end(len(bytes) - 1)
    count = 0
    for char in reversed(conc_text[:start_highlight]):
        count += 1
        if count > 200 and char == ' ':
            break
    begin = start_highlight - count
    end = 0
    for char in conc_text[end_highlight:]:
        end += 1
        if end > 200 and char == ' ':
            break
    end += end_highlight
    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    conc_text = first_span + conc_text[:begin] + '</span>' + conc_text[begin:end] + second_span + conc_text[end:] + '</span>'
    return conc_text
def fetch_concordance(hit, path, context_size):
    ## Determine length of text needed
    byte_distance = hit.bytes[-1] - hit.bytes[0]
    length = context_size + byte_distance + context_size
    bytes, byte_start = adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_text = format_concordance(conc_text, bytes)
    conc_text = convert_entities(conc_text)
    conc_text = strip_start_punctuation.sub("", conc_text)
    return conc_text
def fetch_concordance(db, hit, path, context_size):
    ## Determine length of text needed
    bytes = sorted(hit.bytes)
    byte_distance = bytes[-1] - bytes[0]
    length = context_size + byte_distance + context_size
    bytes, byte_start = adjust_bytes(bytes, context_size)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_text = format_concordance(conc_text, db.locals['word_regex'], bytes)
    conc_text = convert_entities(conc_text)
    conc_text = strip_start_punctuation.sub("", conc_text)
    return conc_text
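# Illustrative sketch only (not part of the original source): a minimal, self-contained
# approximation of the byte arithmetic the fetch_concordance variants above rely on.
# The real adjust_bytes lives in PhiloLogic; this toy version simply assumes the goal is
# to rebase the hit offsets against a window that starts context_size bytes earlier.
def _toy_adjust_bytes(hit_bytes, context_size):
    """Return hit offsets rebased against a window starting context_size bytes earlier."""
    hit_bytes = sorted(hit_bytes)
    window_start = max(hit_bytes[0] - context_size, 0)
    rebased = [b - window_start for b in hit_bytes]
    return rebased, window_start

if __name__ == "__main__":
    # A hit whose words sit at byte offsets 5120 and 5168, with 300 bytes of context per side:
    rebased, start = _toy_adjust_bytes([5168, 5120], 300)
    length = 300 + (5168 - 5120) + 300  # same formula as fetch_concordance above
    print("start=%d rebased=%s length=%d" % (start, rebased, length))  # start=4820 rebased=[300, 348] length=648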
def fetch_collocation(results, path, q, db, word_filter=True, filter_num=100, full_report=True):
    within_x_words = q['word_num']

    ## set up filtering of most frequent 200 terms ##
    filter_list = set([])
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        filter_words_file = open(filter_list_path)
        line_count = 0
        for line in filter_words_file:
            line_count += 1
            word = line.split()[0]
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break

    ## start going though hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)

    count = 0
    if not full_report:
        q['colloc_start'] = None
        q['colloc_end'] = None
    for hit in results[q['colloc_start']:q['colloc_end']]:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        conc_text = unicodedata.normalize('NFC', conc_text)

        start_highlight = conc_text.find('<span class="highlight"')
        m = end_highlight_match.search(conc_text)
        end_highlight = m.end(len(m.groups()) - 1)
        conc_left = conc_text[:start_highlight]
        conc_right = conc_text[end_highlight:]

        left_words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right', db)

        for l_word in left_words:
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1
        for r_word in right_words:
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1

    if full_report:
        return dict(all_collocates), dict(left_collocates), dict(right_collocates)
    else:
        return sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)
def fetch_colloc_concordance(results, path, q, db, filter_words=100):
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = unicodedata.normalize('NFC', q['collocate'].decode('utf-8', 'ignore'))
    collocate_num = q['collocate_num']

    ## set up filtering of most frequent 200 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break

    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        #conc_text = unicodedata.normalize('NFC', conc_text)

        start_highlight = conc_text.find('<span class="highlight"')
        m = end_highlight_match.search(conc_text)
        end_highlight = m.end(len(m.groups()) - 1)
        conc_left = conc_text[:start_highlight]
        conc_right = conc_text[end_highlight:]

        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction, db)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction, db)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right', db))

        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)

        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break

    h = collocation_hitlist(new_hitlist, collocate_num)
    return h
def KWIC_formatter(output, hit_num, chars=40):
    output = output.replace("\n", " ")
    output = output.replace("\r", "")
    output = output.replace("\t", " ")
    output = re.sub(" {2,}", " ", output)
    output = convert_entities(output)
    start_hit = output.index('<span class="highlight">')
    end_hit = output.rindex("</span>") + 7
    tag_length = 7 * hit_num
    ## Dont know why I need to run the converter twice...
    start_output = output[start_hit - chars:start_hit]
    if len(start_output) < chars:
        white_space = " " * (chars - len(start_output))
        start_output = white_space + start_output
    start_output = '<span style="white-space:pre-wrap;">' + start_output + "</span>"
    end_output = output[end_hit:]
    match = output[start_hit:end_hit]
    return start_output + match + end_output[:chars + tag_length]
def KWIC_formatter(output, hit_num, chars=40):
    output = output.replace('\n', ' ')
    output = output.replace('\r', '')
    output = output.replace('\t', ' ')
    output = re.sub(' {2,}', ' ', output)
    output = convert_entities(output)
    start_hit = output.index('<span class="highlight">')
    end_hit = output.rindex('</span>') + 7
    tag_length = 7 * hit_num
    start_output = output[start_hit - chars:start_hit]
    start_output = re.sub('^[^ ]+? ', ' ', start_output, 1)  # Do we want to keep this?
    if len(start_output) < chars:
        white_space = ' ' * (chars - len(start_output))
        start_output = white_space + start_output
    start_output = '<span style="white-space:pre-wrap;">' + start_output + '</span>'
    end_output = re.sub('[^ ]+\Z', ' ', output[end_hit:], 1)
    match = output[start_hit:end_hit]
    return start_output + match + end_output[:chars + tag_length]
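# Illustrative sketch only (not part of the original source): the KWIC_formatter versions
# above left-pad the text before the highlight so that every hit lines up in a fixed-width
# column. This self-contained fragment shows that padding idea on plain strings, without
# convert_entities or the highlight spans.
def _toy_kwic(left, match, right, chars=40):
    left = left[-chars:].rjust(chars)  # pad with spaces when the left context is short
    return left + match + right[:chars]

if __name__ == "__main__":
    print(_toy_kwic("said the", " philosopher ", "to the assembled company", chars=20))
    print(_toy_kwic("a much longer left context than twenty characters", " philosopher ", "again", chars=20))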
def fetch_relevance(hit, path, q, samples=10):
    length = 75
    text_snippet = []
    hit_num = len(hit.bytes)
    if hit_num < samples:
        byte_sample = sorted(sample(hit.bytes, hit_num))
    else:
        byte_sample = sorted(sample(hit.bytes, samples))
    if hit_num and hit_num < samples:
        length = int(length * samples / hit_num)
    for byte in byte_sample:
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        text_snippet.append(conc_text)
    text = ' ... '.join(text_snippet)
    return text
def fetch_relevance(hit, path, q, samples=10):
    length = 75
    text_snippet = []
    hit_num = len(hit.bytes)
    if hit_num < samples:
        byte_sample = sorted(sample(hit.bytes, hit_num))
    else:
        byte_sample = sorted(sample(hit.bytes, samples))
    if hit_num and hit_num < samples:
        length = int(length * samples / hit_num)
    for byte in byte_sample:
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        #conc_text = re.sub('<(/?span.*?)>', '[\\1]', conc_text)
        #conc_text = re.sub('<.*?>', '', conc_text)
        #conc_text = re.sub('\[(/?span.*?)\]', '<\\1>', conc_text)
        #conc_text = re.sub('<div[^>]*>', '', conc_text)
        #conc_text = re.sub('</div>', '', conc_text)
        text_snippet.append(conc_text)
    text = ' ... '.join(text_snippet)
    return text
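# Illustrative sketch only (not part of the original source): fetch_relevance above samples
# up to `samples` byte offsets per hit and joins the extracted snippets with ' ... '. This
# standalone fragment reproduces just that sampling-and-joining pattern on a plain string.
from random import sample

def _toy_relevance_snippets(text, offsets, samples=10, width=75):
    picked = sorted(sample(offsets, min(len(offsets), samples)))
    snippets = [text[max(o - width // 2, 0):o + width // 2] for o in picked]
    return ' ... '.join(snippets)

if __name__ == "__main__":
    doc = "lorem ipsum " * 200
    print(_toy_relevance_snippets(doc, offsets=[100, 900, 1500, 2100], samples=3, width=40))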
def fetch_colloc_concordance(results, path, q, db, config, word_filter=True, filter_num=100, stopwords=True):
    length = config['concordance_length']
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = unicodedata.normalize('NFC', q['collocate'].decode('utf-8', 'ignore'))
    collocate_num = q['collocate_num']

    ## set up filtering with stopwords or 100 most frequent terms ##
    filter_list = set([q['q']])
    if word_filter:
        if stopwords:
            filter_list_path = path + '/data/stopwords.txt'
            if os.path.isfile(filter_list_path):
                filter_words_file = open(filter_list_path)
                filter_num = float("inf")
            else:
                filter_list_path = path + '/data/frequencies/word_frequencies'
                filter_words_file = open(filter_list_path)
        else:
            filter_list_path = path + '/data/frequencies/word_frequencies'
            filter_words_file = open(filter_list_path)
        line_count = 0
        for line in filter_words_file:
            line_count += 1
            word = line.split()[0]
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break

    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)

        ## Isolate left and right concordances
        conc_left = convert_entities(conc_text[:bytes[0]].decode('utf-8', 'ignore'))
        conc_left = begin_match.sub('', conc_left)
        conc_left = start_cutoff_match.sub('', conc_left)
        conc_right = convert_entities(conc_text[bytes[-1]:].decode('utf-8', 'ignore'))
        conc_right = end_match.sub('', conc_right)
        conc_right = left_truncate.sub('', conc_right)
        conc_left = strip_tags(conc_left)
        conc_right = strip_tags(conc_right)

        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction, db)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction, db)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right', db))

        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)

        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break

    h = collocation_hitlist(new_hitlist, collocate_num)
    return h
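# Illustrative sketch only (not part of the original source): the collocation functions above
# build a filter set from either data/stopwords.txt or the word_frequencies file, keeping at
# most filter_num entries plus the query term itself. This standalone fragment shows that
# loading pattern, with a plain list of lines standing in for the frequency file.
def _toy_build_filter_list(lines, query_word, filter_num=100):
    filter_list = set([query_word])  # the query word itself is always filtered out
    for line_count, line in enumerate(lines, start=1):
        parts = line.split()
        if not parts:  # mirrors the IndexError guard used below
            continue
        filter_list.add(parts[0])
        if line_count > filter_num:
            break
    return filter_list

if __name__ == "__main__":
    freq_lines = ["de 120393", "la 98211", "et 91012", "", "le 87455"]
    print(sorted(_toy_build_filter_list(freq_lines, "philosophie", filter_num=3)))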
def fetch_collocation(results, path, q, db, word_filter=True, filter_num=100, full_report=True, stopwords=True):
    config = f.WebConfig()
    length = config['concordance_length']
    within_x_words = q['word_num']

    ## set up filtering with stopwords or 100 most frequent terms ##
    filter_list = set([q['q']])
    if word_filter:
        if stopwords:
            filter_list_path = path + '/data/stopwords.txt'
            if os.path.isfile(filter_list_path):
                filter_words_file = open(filter_list_path)
                filter_num = float("inf")
            else:
                filter_list_path = path + '/data/frequencies/word_frequencies'
                filter_words_file = open(filter_list_path)
        else:
            filter_list_path = path + '/data/frequencies/word_frequencies'
            filter_words_file = open(filter_list_path)
        line_count = 0
        for line in filter_words_file:
            line_count += 1
            try:
                word = line.split()[0]
            except IndexError:
                continue
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break

    ## start going though hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)

    count = 0
    for hit in results[q['interval_start']:q['interval_end']]:
        bytes, byte_start = adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)

        ## Isolate left and right concordances
        conc_left = convert_entities(conc_text[:bytes[0]].decode('utf-8', 'ignore'))
        conc_left = begin_match.sub('', conc_left)
        conc_left = start_cutoff_match.sub('', conc_left)
        conc_right = convert_entities(conc_text[bytes[-1]:].decode('utf-8', 'ignore'))
        conc_right = end_match.sub('', conc_right)
        conc_right = left_truncate.sub('', conc_right)
        conc_left = strip_tags(conc_left)
        conc_right = strip_tags(conc_right)

        left_words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right', db)

        for l_word in left_words:
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1
        for r_word in right_words:
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1

    if full_report:
        return all_collocates, left_collocates, right_collocates
    else:
        return all_collocates
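# Illustrative sketch only (not part of the original source): once the left and right context
# words are tokenized, fetch_collocation above simply tallies them in three defaultdicts.
# This self-contained fragment shows that counting step with pre-tokenized word lists.
from collections import defaultdict

def _toy_tally_collocates(hits):
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)
    for left_words, right_words in hits:
        for word in left_words:
            left_collocates[word] += 1
            all_collocates[word] += 1
        for word in right_words:
            right_collocates[word] += 1
            all_collocates[word] += 1
    return all_collocates, left_collocates, right_collocates

if __name__ == "__main__":
    hits = [(["esprit", "des"], ["lois", "politiques"]), (["esprit"], ["lois"])]
    all_c, left_c, right_c = _toy_tally_collocates(hits)
    print(sorted(all_c.items(), key=lambda x: x[1], reverse=True))  # same sort used for the non-full report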
def format_text_object(obj, text, config, q, word_regex, bytes=[], note=False):
    philo_id = obj.philo_id
    if bytes:
        new_text = ""
        last_offset = 0
        for b in bytes:
            new_text += text[last_offset:b] + "<philoHighlight/>"
            last_offset = b
        text = new_text + text[last_offset:]
    first_img = ''
    current_obj_img = []
    text = "<div>" + text + "</div>"
    xml = f.FragmentParser.parse(text)
    for el in xml.iter():
        try:
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "ptr" or el.tag == "ref":
                target = el.attrib["target"]
                link = f.link.make_absolute_query_link(config, q, script_name="/scripts/get_notes.py", target=target)
                el.attrib["data-ref"] = link
                el.attrib["id"] = target.replace('#', '') + '-link-back'
                del el.attrib["target"]
                el.attrib['class'] = "note-ref"
                el.attrib['tabindex'] = "0"
                el.attrib['data-toggle'] = "popover"
                el.attrib['data-container'] = "body"
                el.attrib["data-placement"] = "right"
                el.attrib["data-trigger"] = "focus"
                el.attrib["data-html"] = "true"
                el.attrib["data-animation"] = "true"
                el.text = "note"
                el.tag = "span"
            elif el.tag == "note":
                if el.getparent().attrib["type"] != "notes":  ## inline notes
                    el.tag = 'span'
                    el.attrib['class'] = "note-content"
                    for child in el:
                        child = note_content(child)
                    # insert an anchor before this element by scanning through the parent
                    parent = el.getparent()
                    for i, child in enumerate(parent):
                        if child == el:
                            attribs = {"class": "note", "tabindex": "0", "data-toggle": "popover", "data-container": "body", "data-placement": "right", "data-trigger": "focus"}
                            parent.insert(i, etree.Element("a", attrib=attribs))
                            new_anchor = parent[i]
                            new_anchor.text = "note"
                else:  # endnotes
                    el.tag = "div"
                    el.attrib['class'] = "xml-note"
                    note_id = '#' + el.attrib['id']
                    link_back = etree.Element("a")
                    link_back.attrib['note-link-back'] = f.link.make_absolute_query_link(config, q, script_name="/scripts/get_note_link_back.py", doc_id=str(philo_id[0]), note_id=note_id)
                    link_back.attrib['class'] = "btn btn-xs btn-default link-back"
                    link_back.attrib['role'] = "button"
                    link_back.text = "Go back to text"
                    el.append(link_back)
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "ab" or el.tag == "ln":
                el.tag = "l"
            elif el.tag == "pb" and "n" in el.attrib:
                if "fac" in el.attrib or "id" in el.attrib:
                    if "fac" in el.attrib:
                        img = el.attrib["fac"]
                    else:
                        img = el.attrib["id"]
                    current_obj_img.append(img)
                    el.tag = "p"
                    el.append(etree.Element("a"))
                    el[-1].attrib["href"] = config.page_images_url_root + '/' + img
                    el[-1].text = "[page " + el.attrib["n"] + "]"
                    el[-1].attrib['class'] = "page-image-link"
                    el[-1].attrib['data-gallery'] = ''
            elif el.tag == "figure":
                if el[0].tag == "graphic":
                    img_url = el[0].attrib["url"].replace(":", "_")
                    volume = re.match("\d+", img_url).group()
                    url_prefix = config.page_images_url_root + '/V' + volume + "/plate_"
                    el.tag = "span"
                    el.attrib["href"] = url_prefix + img_url + ".jpeg"
                    el[0].tag = "img"
                    el[0].attrib["src"] = url_prefix + img_url + ".sm.jpeg"
                    el[0].attrib["class"] = "inline-img"
                    el.attrib["class"] = "inline-img-container"
                    del el[0].attrib["url"]
                    clear_float = etree.Element("span")
                    clear_float.attrib['style'] = 'clear:both;'
                    el[0].append(clear_float)
            elif el.tag == "philoHighlight":
                word_match = re.match(word_regex, el.tail, re.U)
                if word_match:
                    el.text = el.tail[:word_match.end()]
                    el.tail = el.tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in valid_html_tags:
                el = xml_to_html_class(el)
        except:
            pass
    output = etree.tostring(xml)
    ## remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    output = convert_entities(output.decode('utf-8', 'ignore')).encode('utf-8')
    if note:  ## Notes don't need to fetch images
        return (output, {})
    ## Page images
    output, img_obj = page_images(config, output, current_obj_img, philo_id)
    return output, img_obj
def format_text_object(text, config, q, word_regex, bytes=[]):
    if bytes:
        new_text = ""
        last_offset = 0
        for b in bytes:
            new_text += text[last_offset:b] + "<philoHighlight/>"
            last_offset = b
        text = new_text + text[last_offset:]
    text = "<div>" + text + "</div>"
    xml = f.FragmentParser.parse(text)
    for el in xml.iter():
        try:
            if el.tag == "sc" or el.tag == "scx":
                el.tag = "span"
                el.attrib["class"] = "small-caps"
            elif el.tag == "head":
                el.tag = "b"
                el.attrib["class"] = "headword"
                el.append(etree.Element("br"))
            elif el.tag == "list":
                el.tag = "ul"
            elif el.tag == "title":
                el.tag = "span"
                el.attrib['class'] = "xml-title"
            elif el.tag == "q":
                el.tag = "span"
                el.attrib['class'] = 'xml-q'
            elif el.tag == "ptr" or el.tag == "ref":
                target = el.attrib["target"]
                link = f.link.make_absolute_query_link(config, q, script_name="/scripts/get_notes.py", target=target)
                el.attrib["data-ref"] = link
                del el.attrib["target"]
                el.attrib['class'] = "note-ref"
                el.attrib['tabindex'] = "0"
                el.attrib['data-toggle'] = "popover"
                el.attrib['data-container'] = "body"
                el.attrib["data-placement"] = "right"
                el.attrib["data-trigger"] = "focus"
                el.attrib["data-html"] = "true"
                el.attrib["data-animation"] = "true"
                el.text = "note"
                el.tag = "span"
            elif el.tag == "note" and el.getparent().attrib["type"] != "notes":
                el.tag = 'span'
                el.attrib['class'] = "note-content"
                for child in el:
                    child = note_content(child)
                # insert an anchor before this element by scanning through the parent
                parent = el.getparent()
                for i, child in enumerate(parent):
                    if child == el:
                        attribs = {"class": "note", "tabindex": "0", "data-toggle": "popover", "data-container": "body", "data-placement": "right", "data-trigger": "focus"}
                        parent.insert(i, etree.Element("a", attrib=attribs))
                        new_anchor = parent[i]
                        new_anchor.text = "note"
            elif el.tag == "item":
                el.tag = "li"
            elif el.tag == "ab" or el.tag == "ln":
                el.tag = "l"
            elif el.tag == "pb" and "fac" in el.attrib and "n" in el.attrib:
                el.tag = "p"
                el.append(etree.Element("a"))
                el[-1].attrib["href"] = 'http://artflx.uchicago.edu/images/encyclopedie/' + el.attrib["fac"]
                el[-1].text = "[page " + el.attrib["n"] + "]"
                el[-1].attrib['class'] = "page-image-link"
                el[-1].attrib['data-gallery'] = ''
            elif el.tag == "figure":
                if el[0].tag == "graphic":
                    img_url = el[0].attrib["url"].replace(":", "_")
                    volume = re.match("\d+", img_url).group()
                    url_prefix = "http://artflx.uchicago.edu/images/encyclopedie/V" + volume + "/plate_"
                    el.tag = "a"
                    el.attrib["href"] = url_prefix + img_url + ".jpeg"
                    el[0].tag = "img"
                    el[0].attrib["src"] = url_prefix + img_url + ".sm.jpeg"
                    el[0].attrib["class"] = "plate_img"
                    el.attrib["class"] = "plate-image-link"
                    el.attrib['data-gallery'] = ''
                    del el[0].attrib["url"]
                    el.append(etree.Element("br"))
            elif el.tag == "philoHighlight":
                word_match = re.match(word_regex, el.tail, re.U)
                if word_match:
                    el.text = el.tail[:word_match.end()]
                    el.tail = el.tail[word_match.end():]
                el.tag = "span"
                el.attrib["class"] = "highlight"
            if el.tag not in valid_html_tags:
                el = xml_to_html_class(el)
        except:
            pass
    output = etree.tostring(xml)
    ## remove spaces around hyphens and apostrophes
    output = re.sub(r" ?([-';.])+ ", '\\1 ', output)
    return convert_entities(output.decode('utf-8', 'ignore')).encode('utf-8')