def fetch_concordance(hit, path, q):
    """Return the concordance text for ``hit`` with distant context hidden.

    The text is read with ``f.get_text``, passed through ``format_strip`` and
    ``convert_entities``, and then split around the first highlighted span:
    about 200 characters on each side (extended to the next space) stay
    visible, while everything before and after is wrapped in
    ``begin_concordance`` / ``end_concordance`` spans styled
    ``display:none``.  If no highlighted span is found, the converted text is
    returned without any wrapping.

    ``q`` is part of the call signature but is not read here.
    """
    ## Determine length of text needed
    byte_distance = hit.bytes[-1] - hit.bytes[0]
    length = 1000 + byte_distance + 1000

    hit_bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_text = format_strip(conc_text, hit_bytes)
    conc_text = convert_entities(conc_text)

    m = re.search(r'<span class="highlight">[^<]*(</span>)', conc_text)
    if m is None:
        return conc_text

    start_highlight = conc_text.find('<span class="highlight"')
    ## The pattern has a single group, so indexing the match by the number of
    ## hit offsets raised IndexError for hits with more than two offsets.
    ## The end of the whole match is the end of the closing </span>.
    end_highlight = m.end()

    ## Walk backwards from the highlight: keep at least 200 characters and
    ## stop at the next space so that no word is cut in half.
    count = 0
    for char in reversed(conc_text[:start_highlight]):
        count += 1
        if count > 200 and char == ' ':
            break
    begin = start_highlight - count

    ## Same walk forwards from the end of the highlight.
    end = 0
    for char in conc_text[end_highlight:]:
        end += 1
        if end > 200 and char == ' ':
            break
    end += end_highlight

    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    return (first_span + conc_text[:begin] + '</span>'
            + conc_text[begin:end]
            + second_span + conc_text[end:] + '</span>')
def fetch_relevance(hit, path, q, kwic=True, samples=3):
    """Build a text snippet from up to ``samples`` offsets of ``hit.bytes``.

    When ``hit.bytes`` holds at least ``samples`` entries, ``sample`` picks
    ``samples`` of them; otherwise every entry is used.  Each offset yields a
    400-byte chunk that is split with ``chunkifier``, cleaned with
    ``clean_text`` and decoded as UTF-8 (undecodable bytes ignored).  With
    ``kwic`` set, each chunk is also passed to ``align_text`` and the chunks
    are joined with ``'<br>\\n'``; otherwise they are joined with ``'... '``.

    ``q`` is part of the call signature but is not read here.
    """
    window = 400
    if len(hit.bytes) < samples:
        offsets = hit.bytes
    else:
        offsets = sample(hit.bytes, samples)

    snippets = []
    for offset in offsets:
        adjusted, byte_start = adjust_bytes([int(offset)], window)
        raw = f.get_text(hit, byte_start, window, path)
        left, middle, right = chunkifier(raw, adjusted, highlight=True, kwic=kwic)
        left = clean_text(left, kwic=kwic)
        right = clean_text(right, kwic=kwic)
        if kwic:
            ## the highlighted middle keeps its tags in KWIC mode
            middle = clean_text(middle, notag=False, kwic=kwic)
        snippet = (left + middle + right).decode('utf-8', 'ignore')
        if kwic:
            snippet = align_text(snippet, 1)
        snippets.append(snippet)

    separator = '<br>\n' if kwic else '... '
    return separator.join(snippets)
    
     
def fetch_relevance(hit, path, q, kwic=True, samples=3):
    """Return joined text snippets taken around sampled hit offsets.

    Uses all of ``hit.bytes`` when it has fewer than ``samples`` entries and
    ``sample(hit.bytes, samples)`` otherwise.  Every offset produces a
    400-byte chunk: split by ``chunkifier``, cleaned by ``clean_text``,
    decoded as UTF-8 ignoring errors, and (in ``kwic`` mode) aligned with
    ``align_text``.  KWIC snippets are joined with ``'<br>\\n'``, plain
    snippets with ``'... '``.

    ``q`` is accepted but not used by this function.
    """
    chunk_length = 400
    enough_hits = len(hit.bytes) >= samples
    byte_sample = sample(hit.bytes, samples) if enough_hits else hit.bytes

    text_snippet = []
    for raw_offset in byte_sample:
        single = [int(raw_offset)]
        adjusted, byte_start = adjust_bytes(single, chunk_length)
        chunk = f.get_text(hit, byte_start, chunk_length, path)
        head, hit_part, tail = chunkifier(chunk, adjusted, highlight=True, kwic=kwic)
        head = clean_text(head, kwic=kwic)
        tail = clean_text(tail, kwic=kwic)
        if not kwic:
            text_snippet.append((head + hit_part + tail).decode('utf-8', 'ignore'))
            continue
        ## KWIC mode: clean the hit itself (tags kept) and align the line
        hit_part = clean_text(hit_part, notag=False, kwic=kwic)
        decoded = (head + hit_part + tail).decode('utf-8', 'ignore')
        text_snippet.append(align_text(decoded, 1))

    if kwic:
        return '<br>\n'.join(text_snippet)
    return '... '.join(text_snippet)
def fetch_collocation(results, path, q, filter_words=100):
    """Count the words occurring near each hit and return the top collocates.

    The first word of each line of
    ``<path>/data/frequencies/word_frequencies`` is loaded into a filter set
    that is handed to ``tokenize``.  For every hit, 400 bytes of text are
    fetched and split with ``chunkifier``; the words returned by ``tokenize``
    for the left and right sides are counted, skipping any word equal to
    ``q['q']``.

    Returns ``zip(all_out, left_out, right_out)``, where each list holds up
    to 100 ``(word, count)`` pairs sorted by descending count.  NOTE: ``zip``
    stops at the shortest of the three lists.
    """
    within_x_words = q['word_num']

    ## set up filtering of the most frequent terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_list = set()
    ## ``with`` closes the frequency file; the handle used to be leaked.
    with open(filter_list_path) as filter_words_file:
        for line_count, line in enumerate(filter_words_file, 1):
            fields = line.split()
            if fields:  ## a blank line has no word to add
                filter_list.add(fields[0].decode('utf-8', 'ignore'))
            ## checked after adding, so filter_words + 1 lines are read
            if line_count > filter_words:
                break

    ## start going though hits ##
    left_collocates = {}
    right_collocates = {}
    all_collocates = {}

    for hit in results:
        ## get my chunk of text ##
        hit_bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, hit_bytes)

        left_words = tokenize(conc_left, filter_list, within_x_words, 'left')
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right')

        for l_word in left_words:
            if l_word == q['q']:
                continue
            left_collocates[l_word] = left_collocates.get(l_word, 0) + 1
            all_collocates[l_word] = all_collocates.get(l_word, 0) + 1

        for r_word in right_words:
            if r_word == q['q']:
                continue
            right_collocates[r_word] = right_collocates.get(r_word, 0) + 1
            all_collocates[r_word] = all_collocates.get(r_word, 0) + 1

    left_out = sorted(left_collocates.items(), key=lambda x: x[1], reverse=True)[:100]
    right_out = sorted(right_collocates.items(), key=lambda x: x[1], reverse=True)[:100]
    all_out = sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)[:100]

    return zip(all_out, left_out, right_out)
Example #5
0
def fetch_concordance(hit, path, context_size):
    """Return the formatted concordance text surrounding ``hit``.

    The amount of text fetched is ``context_size`` on each side plus the
    distance between the first and last entries of ``hit.bytes``.  The text
    then goes through ``format_concordance`` and ``convert_entities``, and
    ``strip_start_punctuation`` matches are removed.
    """
    ## span covered by the hit itself
    hit_span = hit.bytes[-1] - hit.bytes[0]
    length = context_size + hit_span + context_size

    offsets, byte_start = adjust_bytes(hit.bytes, length)
    raw_text = f.get_text(hit, byte_start, length, path)
    formatted = convert_entities(format_concordance(raw_text, offsets))
    return strip_start_punctuation.sub("", formatted)
Example #6
0
def fetch_concordance(hit, path, context_size):
    """Fetch and format the concordance text for a single hit.

    Reads ``context_size`` bytes of context on both sides of the span between
    the first and last offsets in ``hit.bytes``, formats the text with
    ``format_concordance``, converts entities with ``convert_entities`` and
    removes whatever ``strip_start_punctuation`` matches.
    """
    ## total length: context + hit span + context
    total_length = context_size + (hit.bytes[-1] - hit.bytes[0]) + context_size

    adjusted, byte_start = adjust_bytes(hit.bytes, total_length)
    text = f.get_text(hit, byte_start, total_length, path)
    text = format_concordance(text, adjusted)
    text = convert_entities(text)
    return strip_start_punctuation.sub("", text)
Example #7
0
def fetch_concordance(db, hit, path, context_size):
    """Return the formatted concordance text for ``hit``.

    The hit offsets are sorted first.  The text length fetched is
    ``context_size`` on each side plus the distance between the smallest and
    largest offset, while ``adjust_bytes`` is called with ``context_size``.
    Formatting uses ``db.locals['word_regex']``.
    """
    ordered = sorted(hit.bytes)
    ## context + hit span + context
    length = context_size + (ordered[-1] - ordered[0]) + context_size

    offsets, byte_start = adjust_bytes(ordered, context_size)
    raw_text = f.get_text(hit, byte_start, length, path)
    formatted = format_concordance(raw_text, db.locals['word_regex'], offsets)
    formatted = convert_entities(formatted)
    return strip_start_punctuation.sub("", formatted)
def fetch_collocation(results, path, q, db, word_filter=True, filter_num=100, full_report=True):
    """Count the words found to the left and right of each hit.

    When ``word_filter`` is true, the first word of each line of
    ``<path>/data/frequencies/word_frequencies`` is loaded into a filter set
    passed to ``tokenize``.  Hits are taken from
    ``results[q['colloc_start']:q['colloc_end']]``; when ``full_report`` is
    false both keys are first set to ``None`` in ``q`` (the caller's dict is
    modified), so all results are scanned.

    Returns three plain dicts ``(all, left, right)`` mapping word to count
    when ``full_report`` is true, otherwise a list of ``(word, count)`` pairs
    for all collocates sorted by descending count.
    """
    within_x_words = q['word_num']

    ## set up filtering of the most frequent terms ##
    filter_list = set()
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        ## ``with`` closes the frequency file; the handle used to be leaked.
        with open(filter_list_path) as filter_words_file:
            for line_count, line in enumerate(filter_words_file, 1):
                fields = line.split()
                if fields:  ## a blank line has no word to add
                    filter_list.add(fields[0].decode('utf-8', 'ignore'))
                ## checked after adding, so filter_num + 1 lines are read
                if line_count > filter_num:
                    break

    ## start going though hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)

    if not full_report:
        q['colloc_start'] = None
        q['colloc_end'] = None
    for hit in results[q['colloc_start']:q['colloc_end']]:
        ## get my chunk of text ##
        hit_bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_text = format_strip(conc_text, hit_bytes)
        conc_text = convert_entities(conc_text)
        conc_text = unicodedata.normalize('NFC', conc_text)
        m = end_highlight_match.search(conc_text)
        if m is None:
            ## no highlighted span to split on: skip the hit instead of
            ## failing on ``None.end``
            continue
        start_highlight = conc_text.find('<span class="highlight"')
        end_highlight = m.end(len(m.groups()) - 1)
        conc_left = conc_text[:start_highlight]
        conc_right = conc_text[end_highlight:]

        left_words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right', db)

        for l_word in left_words:
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1

        for r_word in right_words:
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1

    if full_report:
        return dict(all_collocates), dict(left_collocates), dict(right_collocates)
    return sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)
def fetch_collocation(results, path, q, word_filter=True, filter_num=200, full_report=True):
    """Count the words found to the left and right of each hit.

    When ``word_filter`` is true, the first word of each line of
    ``<path>/data/frequencies/word_frequencies`` is loaded into a filter set
    passed to ``tokenize``.  Hits are taken from
    ``results[q['colloc_start']:q['colloc_end']]``; when ``full_report`` is
    false both keys are first set to ``None`` in ``q`` (the caller's dict is
    modified).  Words equal to one of the ``|``-separated terms of
    ``q['q']`` are not counted.

    Returns three plain dicts ``(all, left, right)`` mapping word to count
    when ``full_report`` is true, otherwise a list of ``(word, count)`` pairs
    for all collocates sorted by descending count.
    """
    within_x_words = q['word_num']

    ## set up filtering of the most frequent terms ##
    filter_list = set()
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        ## ``with`` closes the frequency file; the handle used to be leaked.
        with open(filter_list_path) as filter_words_file:
            for line_count, line in enumerate(filter_words_file, 1):
                fields = line.split()
                if fields:  ## a blank line has no word to add
                    filter_list.add(fields[0].decode('utf-8', 'ignore'))
                ## checked after adding, so filter_num + 1 lines are read
                if line_count > filter_num:
                    break

    ## start going though hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)

    if not full_report:
        q['colloc_start'] = None
        q['colloc_end'] = None

    ## The query terms do not change between hits, so the set is built once
    ## (on the first hit) rather than once per hit.
    query_words = None
    for hit in results[q['colloc_start']:q['colloc_end']]:
        ## get my chunk of text ##
        hit_bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, hit_bytes)

        left_words = tokenize(conc_left, filter_list, within_x_words, 'left')
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right')

        if query_words is None:
            query_words = set([w.decode('utf-8') for w in q['q'].split('|')])

        for l_word in left_words:
            if l_word in query_words:
                continue
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1

        for r_word in right_words:
            if r_word in query_words:
                continue
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1

    if full_report:
        return dict(all_collocates), dict(left_collocates), dict(right_collocates)
    return sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)
def fetch_colloc_concordance(results, path, q, db, filter_words=100):
    """Collect the hits whose nearby words include ``q['collocate']``.

    The collocate is decoded as UTF-8 (errors ignored) and NFC-normalized.
    For each hit, 400 bytes of text are fetched and split around the
    highlighted span; the side(s) selected by ``q['direction']`` (``'left'``,
    ``'right'``, anything else means both) are tokenized.  A hit is kept when
    the collocate occurs among those words, and the number of occurrences is
    stored on it as ``hit.collocate_num``.  Scanning stops once more than
    ``q["start"] + q["results_per_page"]`` hits have been kept.

    Returns ``collocation_hitlist(kept_hits, q['collocate_num'])``.
    """
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = unicodedata.normalize('NFC', q['collocate'].decode('utf-8', 'ignore'))
    collocate_num = q['collocate_num']

    ## set up filtering of the most frequent terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_list = set()
    ## ``with`` closes the frequency file; the handle used to be leaked.
    with open(filter_list_path) as filter_words_file:
        for line_count, line in enumerate(filter_words_file, 1):
            fields = line.split()
            if fields:  ## a blank line has no word to add
                filter_list.add(fields[0].decode('utf-8', 'ignore'))
            ## checked after adding, so filter_words + 1 lines are read
            if line_count > filter_words:
                break

    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        hit_bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_text = format_strip(conc_text, hit_bytes)
        conc_text = convert_entities(conc_text)
        m = end_highlight_match.search(conc_text)
        if m is None:
            ## no highlighted span to split on: skip the hit instead of
            ## failing on ``None.end``
            continue
        start_highlight = conc_text.find('<span class="highlight"')
        end_highlight = m.end(len(m.groups()) - 1)
        conc_left = conc_text[:start_highlight]
        conc_right = conc_text[end_highlight:]
        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction, db)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction, db)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right', db))

        ## a single count replaces the membership test plus a second scan
        count = words.count(collocate)
        if count:
            hit.collocate_num = count
            new_hitlist.append(hit)

        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break

    return collocation_hitlist(new_hitlist, collocate_num)
Example #11
0
def fetch_concordance(hit, path, q, length=2000):
    """Return the concordance text for ``hit`` with distant context hidden.

    ``length`` bytes of text are fetched, split with ``f.format.chunkifier``,
    cleaned on both sides with ``f.format.clean_text`` and decoded as UTF-8
    (errors ignored).  A window from 200 characters before the first
    ``<span class="highlight"`` to 200 characters after its start stays
    visible; the text before and after is wrapped in ``begin_concordance`` /
    ``end_concordance`` spans styled ``display:none``.

    ``q`` is part of the call signature but is not read here.
    """
    hit_bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_start, conc_middle, conc_end = f.format.chunkifier(conc_text, hit_bytes, highlight=True)
    conc_start = f.format.clean_text(conc_start)
    conc_end = f.format.clean_text(conc_end)
    conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')

    highlight_index = conc_text.find('<span class="highlight"')
    ## Clamp at 0: with the highlight inside the first 200 characters the
    ## offset went negative, and a negative slice index counts from the end
    ## of the string, which hid the highlight and duplicated text.
    begin = max(highlight_index - 200, 0)
    end = highlight_index + 200

    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    return (first_span + conc_text[:begin] + '</span>'
            + conc_text[begin:end]
            + second_span + conc_text[end:] + '</span>')
def fetch_concordance(hit, path, q, length=2000):
    """Return the concordance text for ``hit`` with distant context hidden.

    ``length`` bytes of text are fetched, split with ``f.format.chunkifier``,
    cleaned on both sides with ``f.format.clean_text`` and decoded as UTF-8
    (errors ignored).  A window from 200 characters before the first
    ``<span class="highlight"`` to 200 characters after its start stays
    visible; the text before and after is wrapped in ``begin_concordance`` /
    ``end_concordance`` spans styled ``display:none``.

    ``q`` is part of the call signature but is not read here.
    """
    hit_bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_start, conc_middle, conc_end = f.format.chunkifier(conc_text, hit_bytes, highlight=True)
    conc_start = f.format.clean_text(conc_start)
    conc_end = f.format.clean_text(conc_end)
    conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')

    highlight_index = conc_text.find('<span class="highlight"')
    ## Clamp at 0: with the highlight inside the first 200 characters the
    ## offset went negative, and a negative slice index counts from the end
    ## of the string, which hid the highlight and duplicated text.
    begin = max(highlight_index - 200, 0)
    end = highlight_index + 200

    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    return (first_span + conc_text[:begin] + '</span>'
            + conc_text[begin:end]
            + second_span + conc_text[end:] + '</span>')
Example #13
0
def fetch_relevance(hit, path, q, samples=10):
    """Return snippets around the first ``samples`` offsets of ``hit.bytes``.

    Each offset yields a 75-byte chunk that is split with ``chunkifier``,
    cleaned on both sides with ``clean_text`` and decoded as UTF-8 (errors
    ignored).  The snippets are joined with ``' ... '``.

    ``q`` is part of the call signature but is not read here.
    """
    window = 75
    snippets = []
    for offset in hit.bytes[:samples]:
        adjusted, byte_start = adjust_bytes([int(offset)], window)
        raw = f.get_text(hit, byte_start, window, path)
        left, middle, right = chunkifier(raw, adjusted, highlight=True)
        joined = clean_text(left) + middle + clean_text(right)
        snippets.append(joined.decode('utf-8', 'ignore'))
    return ' ... '.join(snippets)
Example #14
0
def fetch_kwic(results, path, q, byte_query, start, end, length=400):
    """Build the KWIC lines for ``results[start:end]`` as HTML strings.

    Each line is a link carrying the citation (``articleAuthor, head``) in a
    hidden full form and a visible short form, followed by the aligned
    concordance text.  The short form is cut to the length of the shortest
    citation on the page, but never below 20 characters; citations shorter
    than 20 characters are padded with spaces.

    The ``length`` argument is replaced per hit by 200 bytes of context on
    each side of the hit span.
    """
    rows = []
    shortest_biblio = 0

    for hit in results[start:end]:
        ## collapse runs of whitespace in the citation
        biblio = " ".join((hit.articleAuthor + ", " + hit.head).split())

        query_string = byte_query(hit.bytes)
        href = "./" + "/".join(str(part) for part in hit.philo_id[:4]) + query_string

        ## track the shortest citation (0 means "none seen yet")
        if shortest_biblio == 0 or len(biblio) < shortest_biblio:
            shortest_biblio = len(biblio)

        ## context + hit span + context
        length = 200 + (hit.bytes[-1] - hit.bytes[0]) + 200

        offsets, byte_start = f.format.adjust_bytes(hit.bytes, length)
        raw = f.get_text(hit, byte_start, length, path)
        aligned = KWIC_formatter(format_strip(raw, offsets), len(hit.bytes))
        rows.append((biblio, href, aligned, hit))

    shortest_biblio = max(shortest_biblio, 20)

    ## Turn every row into its final HTML line
    lines = []
    for biblio, href, text, hit in rows:
        biblio = biblio.ljust(20)
        short_biblio = '<span id="short_biblio" style="white-space:pre-wrap;">%s</span>' % biblio[:shortest_biblio]
        full_biblio = '<span id="full_biblio" style="display:none;">%s</span>' % biblio
        link = (
            '<a href="%s" class="kwic_biblio" style="white-space:pre-wrap;">' % href
            + full_biblio + short_biblio + "</a>: "
        )
        lines.append(link + '<span id="kwic_text">%s</span>' % text)
    return lines
Example #15
0
def fetch_kwic(results, path, q, byte_query, start, end, length=400):
    """Build the KWIC entries for ``results[start:end]``.

    Returns a list of ``(html, hit)`` tuples.  ``html`` is a link holding the
    citation (``author, title``) followed by the aligned concordance text.
    The citation is split at the length of the shortest citation on the
    page: the leading part is visible, the remainder sits in a hidden span.
    """
    rows = []
    shortest_biblio = 0
    for hit in results[start:end]:
        ## collapse runs of whitespace in the citation
        biblio = ' '.join((hit.author + ', ' + hit.title).split())

        query_string = byte_query(hit.bytes)
        href = "./" + '/'.join(str(part) for part in hit.philo_id[:5]) + query_string

        ## track the shortest citation (0 means "none seen yet")
        if shortest_biblio == 0 or len(biblio) < shortest_biblio:
            shortest_biblio = len(biblio)

        ## Get concordance and align it
        offsets, byte_start = f.format.adjust_bytes(hit.bytes, length)
        raw = f.get_text(hit, byte_start, length, path)
        left, middle, right = f.format.chunkifier(raw, offsets, highlight=True, kwic=True)
        left = f.format.clean_text(left, kwic=True)
        right = f.format.clean_text(right, kwic=True)
        middle = f.format.clean_text(middle, notag=False, kwic=True)
        decoded = (left + middle + right).decode('utf-8', 'ignore')
        rows.append((biblio, href, f.format.align_text(decoded, len(hit.bytes)), hit))

    ## Prefix every concordance line with its linked citation
    entries = []
    for biblio, href, text, hit in rows:
        visible = '<span id="short_biblio">%s</span>' % biblio[:shortest_biblio]
        hidden = '<span id="end_biblio" style="display:none;">%s</span>' % biblio[shortest_biblio:]
        link = ('<a href="%s" id="kwic_biblio" style="white-space:pre-wrap;">' % href
                + visible + hidden + '</a>: ')
        entries.append((link + text, hit))
    return entries
Example #16
0
def fetch_kwic(results, path, q, byte_query, start, end, length=400):
    """Build the KWIC lines for ``results[start:end]`` as HTML strings.

    Each line is a link carrying the citation (``author, title``) in a hidden
    full form and a visible form cut to the length of the shortest citation
    on the page, followed by the aligned concordance text.

    The ``length`` argument is replaced per hit by 200 bytes of context on
    each side of the hit span.
    """
    rows = []
    shortest_biblio = 0

    for hit in results[start:end]:
        ## collapse runs of whitespace in the citation
        biblio = ' '.join((hit.author + ', ' + hit.title).split())

        query_string = byte_query(hit.bytes)
        href = "./" + '/'.join(str(part) for part in hit.philo_id[:4]) + query_string

        ## track the shortest citation (0 means "none seen yet")
        if shortest_biblio == 0 or len(biblio) < shortest_biblio:
            shortest_biblio = len(biblio)

        ## context + hit span + context
        length = 200 + (hit.bytes[-1] - hit.bytes[0]) + 200

        ## Get concordance and align it
        offsets, byte_start = f.format.adjust_bytes(hit.bytes, length)
        raw = f.get_text(hit, byte_start, length, path)
        left, middle, right = f.format.chunkifier(raw, offsets, highlight=True, kwic=True)
        left = f.format.clean_text(left, kwic=True)
        right = f.format.clean_text(right, kwic=True)
        middle = f.format.clean_text(middle, notag=False, kwic=True)
        decoded = (left + middle + right).decode('utf-8', 'ignore')
        rows.append((biblio, href, f.format.align_text(decoded, len(hit.bytes)), hit))

    ## Turn every row into its final HTML line
    lines = []
    for biblio, href, text, hit in rows:
        short_biblio = '<span id="short_biblio">%s</span>' % biblio[:shortest_biblio]
        full_biblio = '<span id="full_biblio" style="display:none;">%s</span>' % biblio
        link = ('<a href="%s" class="kwic_biblio" style="white-space:pre-wrap;">' % href
                + full_biblio + short_biblio + '</a>: ')
        lines.append(link + '<span id="kwic_text">%s</span>' % text)
    return lines
def fetch_colloc_concordance(results, path, q, filter_words=100):
    """Collect the hits whose nearby words include ``q['collocate']``.

    The collocate is decoded as UTF-8 (errors ignored).  For each hit, 400
    bytes of text are fetched and split with ``chunkifier``; the side(s)
    selected by ``q['direction']`` (``'left'``, ``'right'``, anything else
    means both) are tokenized.  A hit is kept when the collocate occurs among
    those words, and the number of occurrences is stored on it as
    ``hit.collocate_num``.  Scanning stops once more than
    ``q["start"] + q["results_per_page"]`` hits have been kept.

    Returns ``collocation_hitlist(kept_hits, q['collocate_num'])``.
    """
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = q['collocate'].decode('utf-8', 'ignore')
    collocate_num = q['collocate_num']

    ## set up filtering of the most frequent terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_list = set()
    ## ``with`` closes the frequency file; the handle used to be leaked.
    with open(filter_list_path) as filter_words_file:
        for line_count, line in enumerate(filter_words_file, 1):
            fields = line.split()
            if fields:  ## a blank line has no word to add
                filter_list.add(fields[0].decode('utf-8', 'ignore'))
            ## checked after adding, so filter_words + 1 lines are read
            if line_count > filter_words:
                break

    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        hit_bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, hit_bytes)
        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left')
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right'))

        ## a single count replaces the membership test plus a second scan
        count = words.count(collocate)
        if count:
            hit.collocate_num = count
            new_hitlist.append(hit)

        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break

    return collocation_hitlist(new_hitlist, collocate_num)
Example #18
0
def fetch_relevance(hit, path, q, samples=10):
    """Return snippets around a sorted sample of the offsets in ``hit.bytes``.

    At most ``samples`` offsets are drawn with ``sample`` and sorted.  The
    chunk length starts at 75 and, when the hit has fewer than ``samples``
    offsets (but at least one), is scaled by ``samples / hit_num``.  Each
    chunk goes through ``format_strip`` and ``convert_entities``; the
    snippets are joined with ``' ... '``.

    ``q`` is part of the call signature but is not read here.
    """
    window = 75
    hit_num = len(hit.bytes)
    offsets = sorted(sample(hit.bytes, min(hit_num, samples)))

    ## fewer hits than requested samples: widen each chunk to compensate
    if hit_num and hit_num < samples:
        window = int(window * samples / hit_num)

    snippets = []
    for offset in offsets:
        adjusted, byte_start = adjust_bytes([int(offset)], window)
        raw = f.get_text(hit, byte_start, window, path)
        snippets.append(convert_entities(format_strip(raw, adjusted)))
    return ' ... '.join(snippets)
Example #19
0
def fetch_kwic(results, path, q, byte_query, db, start, end, length=5000):
    """Build the KWIC lines for ``results[start:end]``.

    Citations come from ``f.kwic_citation``.  Short citations are padded
    with ``&nbsp;`` up to the longest short citation on the page, capped at
    30.  When ``q['format']`` is ``"json"`` each entry is a tuple
    ``(citation_html, text, hit.philo_id)``; otherwise it is one HTML string
    with the citation as a link in front of the text.

    ``length`` is the total context fetched around each hit: half on each
    side of the hit span.  ``byte_query`` is accepted but not used here.
    """
    kwic_results = []

    default_short_citation_len = 30
    short_citation_len = 0
    ## Half of the requested context goes on each side of the hit.  It is
    ## computed once: the loop used to reassign ``length`` from itself, so
    ## the window grew by the byte span of every earlier hit.  ``//`` keeps
    ## the integer division the original ``/`` performed on ints.
    half_context = length // 2
    for hit in results[start:end]:
        full_citation, short_citation, href = f.kwic_citation(db, hit, default_short_citation_len)

        ## Find longest short_citation
        short_citation_len = max(short_citation_len, len(short_citation))

        ## Determine length of text needed
        byte_distance = hit.bytes[-1] - hit.bytes[0]
        hit_length = half_context + byte_distance + half_context

        ## Get concordance and align it
        hit_bytes, byte_start = adjust_bytes(hit.bytes, hit_length)
        conc_text = f.get_text(hit, byte_start, hit_length, path)
        conc_text = format_strip(conc_text, hit_bytes)
        conc_text = KWIC_formatter(conc_text, len(hit.bytes))
        kwic_results.append((full_citation, short_citation, href, conc_text, hit))

    ## never pad beyond the longest short citation actually present
    default_short_citation_len = min(default_short_citation_len, short_citation_len)

    ## Populate Kwic_results with bibliography
    for pos, result in enumerate(kwic_results):
        biblio, short_biblio, href, text, hit = result
        if len(short_biblio) < default_short_citation_len:
            diff = default_short_citation_len - len(short_biblio)
            short_biblio += '&nbsp;' * diff
        short_biblio = '<span class="short_biblio">%s</span>' % short_biblio
        full_biblio = '<span class="full_biblio" style="display:none;">%s</span>' % biblio
        kwic_biblio = full_biblio + short_biblio
        if q['format'] == "json":
            kwic_results[pos] = (kwic_biblio, text, hit.philo_id)
        else:
            kwic_biblio_link = '<a href="%s" class="kwic_biblio">' % href + kwic_biblio + '</a>: '
            kwic_results[pos] = kwic_biblio_link + '%s' % text
    return kwic_results
def fetch_colloc_concordance(results, path, q, filter_words=100):
    """Collect the hits whose nearby words include ``q['collocate']``.

    For each hit, 400 bytes of text are fetched and split with
    ``chunkifier``; the side(s) selected by ``q['direction']`` (``'left'``,
    ``'right'``, anything else means both) are tokenized.  A hit is kept when
    the collocate occurs among those words, and the number of occurrences is
    stored on it as ``hit.collocate_num``.  All results are scanned.

    Returns ``collocation_hitlist(kept_hits, q['collocate_num'])``.
    """
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = q['collocate']
    collocate_num = q['collocate_num']

    ## set up filtering of the most frequent terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_list = set()
    ## ``with`` closes the frequency file; the handle used to be leaked.
    with open(filter_list_path) as filter_words_file:
        for line_count, line in enumerate(filter_words_file, 1):
            fields = line.split()
            if fields:  ## a blank line has no word to add
                filter_list.add(fields[0].decode('utf-8', 'ignore'))
            ## checked after adding, so filter_words + 1 lines are read
            if line_count > filter_words:
                break

    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        hit_bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, hit_bytes)
        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left')
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right'))

        ## a single count replaces the membership test plus a second scan
        count = words.count(collocate)
        if count:
            hit.collocate_num = count
            new_hitlist.append(hit)

    return collocation_hitlist(new_hitlist, collocate_num)
Example #21
0
def generate_kwic_results(db, q, config, link_to_hit="div1"):
    """Run the query described by ``q`` and build the KWIC result object.

    The returned dict has a ``description`` (start, end, results per page),
    a copy of the query, one entry per hit on the requested page (philo id,
    aligned context, stripped metadata fields, citation links, citation and
    hit bytes), the total number of hits and the ``done`` flag of the hit
    list.

    The ``link_to_hit`` keyword is described as naming the text object that
    the metadata link leads to; it is not read inside this function.
    """
    hits = db.query(q["q"], q["method"], q["arg"], **q.metadata)
    start, end, n = f.link.page_interval(q.results_per_page, hits, q.start, q.end)
    kwic_object = {
        "description": {"start": start, "end": end, "results_per_page": q.results_per_page},
        "query": dict(pair for pair in q),
    }

    context_size = config.concordance_length

    kwic_results = []
    for hit in hits[start - 1:end]:
        # Get all metadata
        metadata_fields = dict(
            (field, hit[field].strip()) for field in db.locals['metadata_fields']
        )

        ## Get all links and citations
        citation_hrefs = citation_links(db, config, hit)
        citation = concordance_citation(hit, citation_hrefs)

        ## context + hit span + context
        length = context_size + (hit.bytes[-1] - hit.bytes[0]) + context_size

        ## Get concordance and align it
        offsets, byte_start = adjust_bytes(hit.bytes, context_size)
        raw = f.get_text(hit, byte_start, length, config.db_path)
        context = KWIC_formatter(format_strip(raw, offsets), len(hit.bytes))

        kwic_results.append({
            "philo_id": hit.philo_id,
            "context": context,
            "metadata_fields": metadata_fields,
            "citation_links": citation_hrefs,
            "citation": citation,
            "bytes": hit.bytes,
        })

    kwic_object['results'] = kwic_results
    kwic_object['results_length'] = len(hits)
    kwic_object["query_done"] = hits.done
    return kwic_object
def fetch_concordance(hit, path, q, length=2000):
    """Return the concordance text for ``hit`` with the collocate marked.

    ``length`` bytes of text are fetched, split with ``f.format.chunkifier``,
    cleaned on both sides with ``f.format.clean_text`` and decoded as UTF-8
    (errors ignored).  A window from 200 characters before the first
    ``<span class="highlight"`` to 200 characters after its start stays
    visible; the text before and after is wrapped in ``begin_concordance`` /
    ``end_concordance`` spans styled ``display:none``.  Finally every token
    whose lowercase form equals ``q['collocate']`` is wrapped in a
    ``collocate`` span.
    """
    hit_bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_start, conc_middle, conc_end = f.format.chunkifier(conc_text, hit_bytes, highlight=True)
    conc_start = f.format.clean_text(conc_start)
    conc_end = f.format.clean_text(conc_end)
    conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')

    highlight_index = conc_text.find('<span class="highlight"')
    ## Clamp at 0: with the highlight inside the first 200 characters the
    ## offset went negative, and a negative slice index counts from the end
    ## of the string, which hid the highlight and duplicated text.
    begin = max(highlight_index - 200, 0)
    end = highlight_index + 200

    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    conc_text = (first_span + conc_text[:begin] + '</span>'
                 + conc_text[begin:end]
                 + second_span + conc_text[end:] + '</span>')

    ## re.split keeps the captured tokens; groups that did not take part in
    ## a match show up as None and are dropped together with empty strings.
    collocate = q['collocate']
    keep_text = []
    for w in re.split(r"([^ \.,;:?!\"\n\r\t\(\)]+)|([\.;:?!])", conc_text):
        if not w:
            continue
        if w.lower() == collocate:
            w = '<span class="collocate">%s</span>' % w
        keep_text.append(w)
    return ''.join(keep_text)
def fetch_concordance(hit, path, q):
    """Return the concordance text for ``hit`` with distant context hidden.

    1000 bytes of context on each side of the hit span are fetched, split
    with ``f.format.chunkifier``, cleaned on both sides with
    ``f.format.clean_text`` and decoded as UTF-8 (errors ignored).  The
    visible window runs from 200 characters before the first
    ``<span class="highlight"`` to 200 characters after the last
    ``</span>``, and its end is never placed before the last adjusted hit
    offset plus the length of one highlight span's markup per offset.  Text
    outside the window is wrapped in ``begin_concordance`` /
    ``end_concordance`` spans styled ``display:none``.

    ``q`` is part of the call signature but is not read here.
    """
    ## Determine length of text needed
    byte_distance = hit.bytes[-1] - hit.bytes[0]
    length = 1000 + byte_distance + 1000

    hit_bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_start, conc_middle, conc_end = f.format.chunkifier(conc_text, hit_bytes, highlight=True)
    conc_start = f.format.clean_text(conc_start)
    conc_end = f.format.clean_text(conc_end)
    conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')

    start_highlight = conc_text.find('<span class="highlight"')
    end_highlight = conc_text.rfind('</span>')
    ## Clamp at 0: with the highlight inside the first 200 characters the
    ## offset went negative, and a negative slice index counts from the end
    ## of the string, which hid the highlight and duplicated text.
    begin = max(start_highlight - 200, 0)
    ## Lower bound for the window end (previously stored in a local named
    ## ``min``, which shadowed the builtin).
    min_end = hit_bytes[-1] + len("<span class='highlight'></span>") * len(hit_bytes)
    end = max(end_highlight + 200, min_end)

    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    return (first_span + conc_text[:begin] + '</span>'
            + conc_text[begin:end]
            + second_span + conc_text[end:] + '</span>')
def fetch_concordance(hit, path, q, length=2000):
    """Build the concordance HTML for one hit and mark the collocate.

    Fetches ``length`` bytes of context around ``hit.bytes``, runs it
    through ``f.format.chunkifier`` (with ``highlight=True``) and
    ``f.format.clean_text``, hides everything further than 200 characters
    from the first highlight inside ``begin_concordance`` /
    ``end_concordance`` spans, and wraps every token equal
    (case-insensitively) to ``q['collocate']`` in a ``collocate`` span.

    Returns the resulting decoded text.
    """
    hit_bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
    conc_text = f.get_text(hit, byte_start, length, path)
    conc_start, conc_middle, conc_end = f.format.chunkifier(conc_text, hit_bytes, highlight=True)
    conc_start = f.format.clean_text(conc_start)
    conc_end = f.format.clean_text(conc_end)
    conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')

    highlight_index = conc_text.find('<span class="highlight"')
    # Clamp at 0: a negative slice index counts from the END of the string,
    # which used to hide the highlighted term (and duplicate text) whenever
    # the highlight sat within the first 200 characters or was not found.
    begin = max(highlight_index - 200, 0)
    end = highlight_index + 200
    first_span = '<span class="begin_concordance" style="display:none;">'
    second_span = '<span class="end_concordance" style="display:none;">'
    conc_text = first_span + conc_text[:begin] + '</span>' + conc_text[begin:end] + second_span + conc_text[end:] + '</span>'

    # Decode the collocate once instead of once per token.
    collocate = q['collocate'].decode('utf-8', 'ignore')
    keep_text = []
    # The pattern has two capture groups, so re.split also yields None and
    # empty strings; those are dropped below.
    for token in re.split(r"([^ \.,;:?!\"\n\r\t\(\)]+)|([\.;:?!])", conc_text):
        if not token:
            continue
        if token.lower() == collocate:
            token = '<span class="collocate">%s</span>' % token
        keep_text.append(token)
    return ''.join(keep_text)
def fetch_relevance(hit, path, q, samples=10):
    """Return short text snippets around a random sample of a hit's offsets.

    At most ``samples`` offsets are drawn from ``hit.bytes`` and processed in
    ascending order.  Each snippet starts from a 75-byte window; when the hit
    has fewer offsets than ``samples`` the window is widened proportionally.
    Snippets are passed through ``format_strip`` and ``convert_entities`` and
    joined with ``' ... '``.  ``q`` is not used.
    """
    length = 75
    hit_num = len(hit.bytes)
    # Never request more offsets than the hit actually has.
    byte_sample = sorted(sample(hit.bytes, min(hit_num, samples)))
    if 0 < hit_num < samples:
        length = int(length * samples / hit_num)

    snippets = []
    for offset in byte_sample:
        adjusted, byte_start = adjust_bytes([int(offset)], length)
        raw_text = f.get_text(hit, byte_start, length, path)
        snippets.append(convert_entities(format_strip(raw_text, adjusted)))
    return ' ... '.join(snippets)
Example #26
0
    tf_idf.fit_transform(corpus)
    return tf_idf


def get_keywords(text, tfidf_vectorizer, score_limit=0.15):
    """Return the vocabulary terms of ``text`` scoring at least ``score_limit``.

    ``text`` is transformed with the already-fitted ``tfidf_vectorizer``.
    The result is a list of ``(word, score)`` tuples sorted by descending
    score; terms with equal scores keep their vocabulary order.
    """
    vector = tfidf_vectorizer.transform([text])
    # scikit-learn 1.2 removed get_feature_names() in favour of
    # get_feature_names_out(); prefer the new name, fall back to the old one.
    get_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = tfidf_vectorizer.get_feature_names
    words = get_names()
    tfidf_scores = vector.todense().tolist()[0]

    # Filtering before sorting gives the same order as sorting first (the
    # sort is stable) while sorting far fewer items.
    top_keywords = [(word, score)
                    for word, score in zip(words, tfidf_scores)
                    if score >= score_limit]
    top_keywords.sort(key=lambda pair: pair[1], reverse=True)

    return top_keywords


# One vectorizer per reference corpus.
hh_vectorizer = prepare_tfidf_vectorizer('lemmatized_skills100k.txt')
news_vectorizer = prepare_tfidf_vectorizer('russian_news_corpus/top100k.txt')

# Load and preprocess the two documents under test.
vacancy = preprocess_text(get_text('test_vacancy.txt'))
cv = preprocess_text(get_text('test_cv.txt'))

# Print the keywords of each document against each corpus, with a score
# limit of 0.15 for the skills corpus and 0.10 for the news corpus.
for _document in (vacancy, cv):
    for _vectorizer, _limit in ((hh_vectorizer, 0.15), (news_vectorizer, 0.10)):
        print(get_keywords(_document, _vectorizer, _limit))
Example #27
0
def fetch_collocation(results, path, q, db, word_filter=True, filter_num=100, full_report=True, stopwords=True):
    """Count the words found to the left and right of each hit.

    Only the hits in ``results[q['interval_start']:q['interval_end']]`` are
    examined.  For each one the context before the first hit offset and after
    the last one is cleaned and passed to ``tokenize`` with
    ``q['word_num']``.

    The filter list always contains ``q['q']``.  With ``word_filter`` it
    also gets the first token of each line of ``<path>/data/stopwords.txt``
    (read in full, when ``stopwords`` is true and the file exists) or of
    ``<path>/data/frequencies/word_frequencies`` (reading stops once the
    line count exceeds ``filter_num``).

    Returns ``(all, left, right)`` count dictionaries when ``full_report``
    is true, otherwise only the combined dictionary.
    """
    config = f.WebConfig()
    length = config['concordance_length']
    within_x_words = q['word_num']

    ## set up filtering with stopwords or 100 most frequent terms ##
    filter_list = set([q['q']])
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        if stopwords:
            stopwords_path = path + '/data/stopwords.txt'
            if os.path.isfile(stopwords_path):
                # A stopword list is used in full, so lift the line cap.
                filter_list_path = stopwords_path
                filter_num = float("inf")
        # The context manager closes the handle, which was previously leaked.
        with open(filter_list_path) as filter_words_file:
            for line_count, line in enumerate(filter_words_file, 1):
                fields = line.split()
                if not fields:
                    # Blank line: nothing to add.
                    continue
                filter_list.add(fields[0].decode('utf-8', 'ignore'))
                if line_count > filter_num:
                    break

    ## start going though hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)

    for hit in results[q['interval_start']:q['interval_end']]:
        hit_bytes, byte_start = adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)

        ## Isolate left and right concordances
        conc_left = convert_entities(conc_text[:hit_bytes[0]].decode('utf-8', 'ignore'))
        conc_left = begin_match.sub('', conc_left)
        conc_left = start_cutoff_match.sub('', conc_left)
        conc_right = convert_entities(conc_text[hit_bytes[-1]:].decode('utf-8', 'ignore'))
        conc_right = end_match.sub('', conc_right)
        conc_right = left_truncate.sub('', conc_right)
        conc_left = strip_tags(conc_left)
        conc_right = strip_tags(conc_right)

        left_words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right', db)

        for l_word in left_words:
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1

        for r_word in right_words:
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1

    if full_report:
        return all_collocates, left_collocates, right_collocates
    return all_collocates
def adjust_results(hits, path, q, length=2000):
    """Keep the hits whose query word sits in the requested part of its clause.

    For each hit the clause around the first hit offset is taken to be the
    text between the nearest punctuation marks.  Words of one or two
    characters are dropped (unless they equal ``q['q']``), clauses with fewer
    than three remaining words are skipped, and the 1-based position of the
    query word is expressed as a percentage of the clause length:
    ``<= 35`` is 'Front', ``>= 90`` is 'End', anything between is 'Middle'.

    ``q['theme_rheme']`` selects which positions are kept: 'front', 'end',
    'front_end', 'front_middle_end' or 'full' (all three, additionally
    counted in the report).  Any other value keeps nothing.

    Kept hits get ``percentage``, ``score`` and ``position`` attributes;
    every hit gets ``concordance``.

    Returns ``(theme_rheme_hitlist(kept_hits), full_report)`` where
    ``full_report`` maps position name to count and is only filled in
    'full' mode.
    """
    front_of_clause = 35
    end_of_clause = 90
    word = q['q']
    # NOTE(review): inside a character class '|' is a literal, so this
    # pattern also splits on the pipe character -- confirm that is intended.
    punctuation = re.compile('([,|?|;|.|:|!])')
    # Clause positions reported by each mode.
    positions_by_mode = {
        'front': ('Front',),
        'end': ('End',),
        'front_end': ('Front', 'End'),
        'front_middle_end': ('Front', 'Middle', 'End'),
        'full': ('Front', 'Middle', 'End'),
    }
    new_results = []
    full_report = {}
    for hit in hits:
        hit_bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        hit.concordance = theme_rheme_concordance(conc_text, hit_bytes)
        clause_start = punctuation.split(conc_text[:hit_bytes[0]])[-1]  # keep only last bit
        clause_end = punctuation.split(conc_text[hit_bytes[0]:])[0]  # keep only first bit
        clause = f.format.clean_text(clause_start + clause_end)
        new_clause = [i for i in clause.split() if len(i) > 2 or i.lower() == word]
        clause_len = len(new_clause)
        if clause_len < 3:
            continue

        # Stays 0 when the word is not found among the clause tokens.
        word_position = 0
        for pos, w in enumerate(new_clause):
            if w.lower() == word:
                word_position = pos + 1
                break

        # float() forces true division: under Python 2 the integer division
        # collapsed the ratio to 0 unless the word was the last token.
        percentage = round(float(word_position) / clause_len * 100, 2)
        if percentage <= front_of_clause:
            position = 'Front'
        elif percentage < end_of_clause:
            position = 'Middle'
        else:
            position = 'End'

        mode = q['theme_rheme']
        if position not in positions_by_mode.get(mode, ()):
            continue

        hit.percentage = str(percentage) + '%'
        hit.score = str(word_position) + '/' + str(clause_len)
        hit.position = position
        new_results.append(hit)
        if mode == 'full':
            full_report[position] = full_report.get(position, 0) + 1
    return theme_rheme_hitlist(new_results), full_report
Example #29
0
from functions import write_doc, get_text
import PySimpleGUI as gui

print(gui.version)
if __name__ == "__main__":
    doc_file = gui.PopupGetFile(title='Find document',
                                message='Choose document path')
    # Only continue when a path was actually chosen.  The truthiness test
    # covers both an empty string (plain 'ok') and None (dialog cancelled).
    if doc_file:
        # Ask for the title of the docx file the bold terms are copied to.
        file_name = gui.PopupGetText(
            'Enter the name of the file for bold terms to be copied to')
        ok_message = gui.Popup(custom_text='Ok', button_type=gui.Ok())

        # The extraction now runs only on this branch and inside the
        # __main__ guard; it used to run unconditionally and failed with a
        # NameError on file_name when no path had been chosen.
        text_list = get_text(doc_file)
        write_doc(text_list, file_name)

    else:
        error_message = gui.Popup(title="Error",
                                  custom_text='No Path Chosen',
                                  button_type=gui.Ok())
Example #30
0
def adjust_results(hits, path, q, length=2000):
    """Filter hits by where the query word falls inside its clause.

    The clause of a hit is the text between the punctuation marks nearest
    to the first hit offset.  After dropping words shorter than three
    characters (the query word ``q['q']`` is always kept) and skipping
    clauses left with fewer than three words, the 1-based position of the
    query word is turned into a percentage of the clause length and
    classified: 'Front' (``<= 35``), 'End' (``>= 90``) or 'Middle'.

    ``q['theme_rheme']`` chooses the positions to keep: 'front', 'end',
    'front_end', 'front_middle_end' or 'full'.  'full' keeps all three and
    also tallies them in the report; unknown values keep nothing.

    Every hit receives a ``concordance`` attribute; kept hits also receive
    ``percentage``, ``score`` and ``position``.

    Returns a tuple of ``theme_rheme_hitlist(kept_hits)`` and a dict of
    per-position counts (empty unless the mode is 'full').
    """
    front_of_clause = 35
    end_of_clause = 90
    word = q['q']
    # NOTE(review): '|' is literal inside a character class, so the pipe
    # character is treated as punctuation too -- confirm that is intended.
    punctuation = re.compile('([,|?|;|.|:|!])')
    # Mode name -> clause positions that mode reports.
    wanted_positions = {
        'front': ('Front',),
        'end': ('End',),
        'front_end': ('Front', 'End'),
        'front_middle_end': ('Front', 'Middle', 'End'),
        'full': ('Front', 'Middle', 'End'),
    }
    new_results = []
    full_report = {}
    for hit in hits:
        hit_bytes, byte_start = f.format.adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        hit.concordance = theme_rheme_concordance(conc_text, hit_bytes)
        conc_start = conc_text[:hit_bytes[0]]
        clause_start = punctuation.split(conc_start)[-1]  # keep only last bit
        conc_end = conc_text[hit_bytes[0]:]
        clause_end = punctuation.split(conc_end)[0]  # keep only first bit
        clause = f.format.clean_text(clause_start + clause_end)
        new_clause = [
            i for i in clause.split() if len(i) > 2 or i.lower() == word
        ]
        clause_len = len(new_clause)
        if clause_len < 3:
            continue

        # 0 means the query word was not found among the clause tokens.
        word_position = 0
        for pos, w in enumerate(new_clause):
            if w.lower() == word:
                word_position = pos + 1
                break

        # float() guarantees true division; with Python 2 integer division
        # the percentage was 0 for every position except the last one.
        percentage = round(float(word_position) / clause_len * 100, 2)
        if percentage <= front_of_clause:
            position = 'Front'
        elif percentage < end_of_clause:
            position = 'Middle'
        else:
            position = 'End'

        mode = q['theme_rheme']
        if position not in wanted_positions.get(mode, ()):
            continue

        hit.percentage = str(percentage) + '%'
        hit.score = str(word_position) + '/' + str(clause_len)
        hit.position = position
        new_results.append(hit)
        if mode == 'full':
            full_report[position] = full_report.get(position, 0) + 1
    return theme_rheme_hitlist(new_results), full_report
# Accept one connection on sock_2 (set up earlier in the script) and read a
# single message of at most 1024 bytes from it.
sock_2.listen(1)
conn_2, addr_2 = sock_2.accept()

# presumably bin_dec turns the received bit values into an integer -- TODO confirm
length_hamming_word_input = functions.bin_dec(list(conn_2.recv(1024)))

conn_2.close()

# Receive the encoding of the source text
sock_3 = socket.socket()
#sock_3.bind(('192.168.0.103', 49101))
sock_3.bind(('127.0.0.1', 49101))
sock_3.listen(1)
conn_3, adrr_3 = sock_3.accept()

# The received bytes go through two helper calls; the result is kept in `encoding`.
encoding = functions.get_ascii_code((conn_3.recv(1024)), 16)
encoding = functions.get_text(encoding, 16, 'utf8')

conn_3.close()

# Receive the word length used for the encoding
sock_4 = socket.socket()
#sock_4.bind(('192.168.0.103', 49102))
sock_4.bind(('127.0.0.1', 49102))
sock_4.listen(1)
conn_4, adrr_4 = sock_4.accept()

# NOTE(review): conn_4 and the listening sockets are not closed in this part of the script.
word_length = functions.bin_dec(list(conn_4.recv(1024)))

# Part 2. Print the received data to the screen
print("Hamming_mas = ")
print(Hamming_mas)
Example #32
0
def fetch_collocation(results,
                      path,
                      q,
                      db,
                      word_filter=True,
                      filter_num=100,
                      full_report=True,
                      stopwords=True):
    """Tally the collocates around each hit of a results slice.

    Hits are taken from ``results[q['interval_start']:q['interval_end']]``.
    The text before the first and after the last hit offset is cleaned of
    entities and tags, then tokenized with ``tokenize`` using
    ``q['word_num']``.

    ``q['q']`` is always filtered out.  When ``word_filter`` is true the
    filter also includes the first token of every line of
    ``<path>/data/stopwords.txt`` (whole file, used when ``stopwords`` is
    true and the file exists) or of
    ``<path>/data/frequencies/word_frequencies`` (reading stops once the
    line count exceeds ``filter_num``).

    Returns the combined, left and right count dictionaries as a tuple when
    ``full_report`` is true, else just the combined dictionary.
    """
    config = f.WebConfig()
    length = config['concordance_length']
    within_x_words = q['word_num']

    ## set up filtering with stopwords or 100 most frequent terms ##
    filter_list = set([q['q']])
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        if stopwords:
            stopwords_path = path + '/data/stopwords.txt'
            if os.path.isfile(stopwords_path):
                # The stopword file is read completely, so remove the cap.
                filter_list_path = stopwords_path
                filter_num = float("inf")
        # Opened in a with-block so the handle is closed (it used to leak).
        with open(filter_list_path) as filter_words_file:
            for line_count, line in enumerate(filter_words_file, 1):
                fields = line.split()
                if not fields:
                    # Blank line: nothing to add.
                    continue
                filter_list.add(fields[0].decode('utf-8', 'ignore'))
                if line_count > filter_num:
                    break

    ## start going though hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)

    for hit in results[q['interval_start']:q['interval_end']]:
        hit_bytes, byte_start = adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)

        ## Isolate left and right concordances
        conc_left = convert_entities(conc_text[:hit_bytes[0]].decode(
            'utf-8', 'ignore'))
        conc_left = begin_match.sub('', conc_left)
        conc_left = start_cutoff_match.sub('', conc_left)
        conc_right = convert_entities(conc_text[hit_bytes[-1]:].decode(
            'utf-8', 'ignore'))
        conc_right = end_match.sub('', conc_right)
        conc_right = left_truncate.sub('', conc_right)
        conc_left = strip_tags(conc_left)
        conc_right = strip_tags(conc_right)

        left_words = tokenize(conc_left, filter_list, within_x_words, 'left',
                              db)
        right_words = tokenize(conc_right, filter_list, within_x_words,
                               'right', db)

        for l_word in left_words:
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1

        for r_word in right_words:
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1

    if full_report:
        return all_collocates, left_collocates, right_collocates
    return all_collocates
def fetch_colloc_concordance(results, path, q, db, config, word_filter=True, filter_num=100, stopwords=True):
    """Collect the hits that have the requested collocate near them.

    For every hit the context to the left and right is cleaned and
    tokenized (``q['direction']`` selects 'left', 'right' or, for any other
    value, both sides).  A hit is kept when the NFC-normalised
    ``q['collocate']`` occurs among the tokens; the number of occurrences is
    stored on ``hit.collocate_num``.  Scanning stops once more than
    ``q['start'] + q['results_per_page']`` hits have been kept.

    The filter list is built from ``q['q']`` plus, when ``word_filter`` is
    true, the first token of each line of ``<path>/data/stopwords.txt``
    (whole file, if ``stopwords`` is true and it exists) or of
    ``<path>/data/frequencies/word_frequencies`` (reading stops once the
    line count exceeds ``filter_num``).

    Returns ``collocation_hitlist(kept_hits, q['collocate_num'])``.
    """
    length = config['concordance_length']
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = unicodedata.normalize('NFC', q['collocate'].decode('utf-8', 'ignore'))
    collocate_num = q['collocate_num']

    ## set up filtering with stopwords or 100 most frequent terms ##
    filter_list = set([q['q']])
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        if stopwords:
            stopwords_path = path + '/data/stopwords.txt'
            if os.path.isfile(stopwords_path):
                # A stopword list is used in full, so lift the line cap.
                filter_list_path = stopwords_path
                filter_num = float("inf")
        # The context manager closes the handle, which was previously leaked.
        with open(filter_list_path) as filter_words_file:
            for line_count, line in enumerate(filter_words_file, 1):
                fields = line.split()
                if not fields:
                    # Blank lines used to raise IndexError; skip them.
                    continue
                filter_list.add(fields[0].decode('utf-8', 'ignore'))
                if line_count > filter_num:
                    break

    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        hit_bytes, byte_start = adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)

        ## Isolate left and right concordances
        conc_left = convert_entities(conc_text[:hit_bytes[0]].decode('utf-8', 'ignore'))
        conc_left = begin_match.sub('', conc_left)
        conc_left = start_cutoff_match.sub('', conc_left)
        conc_right = convert_entities(conc_text[hit_bytes[-1]:].decode('utf-8', 'ignore'))
        conc_right = end_match.sub('', conc_right)
        conc_right = left_truncate.sub('', conc_right)
        conc_left = strip_tags(conc_left)
        conc_right = strip_tags(conc_right)

        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction, db)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction, db)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right', db))

        # One pass both tests membership and gives the count; no throwaway
        # set is needed.
        count = words.count(collocate)
        if count:
            hit.collocate_num = count
            new_hitlist.append(hit)

        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break

    return collocation_hitlist(new_hitlist, collocate_num)
Example #34
0
def prepare_tfidf_vectorizer(corpus_file):
    """Return a unigram TfidfVectorizer fitted on the lines of ``corpus_file``.

    The file is read with ``get_text`` and each line is treated as one
    document.  Stop words come from ``load_stopwords()``.
    """
    corpus = get_text(corpus_file).splitlines()
    tf_idf = TfidfVectorizer(ngram_range=(1, 1), stop_words=load_stopwords())
    # Only the fitted state is needed; fit() learns the same vocabulary and
    # IDF weights as fit_transform() without building the unused matrix.
    tf_idf.fit(corpus)
    return tf_idf
Example #35
0
from functions import get_text, preprocess_text
from summa import keywords

# Read both sample documents and run each through the preprocessing step.
vacancy_source = get_text('test_vacancy.txt')
vacancy = preprocess_text(vacancy_source)
cv_source = get_text('test_cv.txt')
cv = preprocess_text(cv_source)

# Vacancy keywords are requested with scores, CV keywords without.
vacancy_keywords = keywords.keywords(vacancy, language="russian", scores=True)
print(vacancy_keywords)
print('====')
cv_keywords = keywords.keywords(cv, language="russian")
print(cv_keywords)
Example #36
0
def fetch_collocation(results, path, q, filter_words=100):
    """Rank the words that occur near the hits, overall and per side.

    The filter list holds the first token of each line of
    ``<path>/data/frequencies/word_frequencies``; reading stops once the
    line count exceeds ``filter_words``.  For every hit a 400-byte context
    is split with ``chunkifier`` and both sides are tokenized with
    ``q['word_num']``; tokens equal to ``q['q']`` are not counted.

    The number of hits with no (non-empty) left-hand tokens is written to
    stderr as ``COUNT <n>``.

    Returns ``zip(all, left, right)`` over the three ``(word, count)``
    lists, each sorted by descending count; the result is therefore as long
    as the shortest of the three.
    """
    within_x_words = q['word_num']

    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_list = set([])
    # The context manager closes the handle, which was previously leaked.
    with open(filter_list_path) as filter_words_file:
        for line_count, line in enumerate(filter_words_file, 1):
            fields = line.split()
            if not fields:
                # Blank lines used to raise IndexError; skip them.
                continue
            filter_list.add(fields[0].decode('utf-8', 'ignore'))
            if line_count > filter_words:
                break

    ## start going though hits ##
    left_collocates = {}
    right_collocates = {}
    all_collocates = {}

    # Hits whose left context produced no usable tokens (diagnostic only).
    count = 0
    for hit in results:
        ## get my chunk of text ##
        hit_bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, hit_bytes)

        left_words = tokenize(conc_left, filter_list, within_x_words, 'left')
        if not any(len(i) for i in left_words):
            count += 1
        right_words = tokenize(conc_right, filter_list, within_x_words,
                               'right')

        for l_word in left_words:
            if l_word == q['q']:
                continue
            left_collocates[l_word] = left_collocates.get(l_word, 0) + 1
            all_collocates[l_word] = all_collocates.get(l_word, 0) + 1

        for r_word in right_words:
            if r_word == q['q']:
                continue
            right_collocates[r_word] = right_collocates.get(r_word, 0) + 1
            all_collocates[r_word] = all_collocates.get(r_word, 0) + 1

    def by_count(collocates):
        # (word, count) pairs, most frequent first.
        return sorted(collocates.items(), key=lambda x: x[1], reverse=True)

    left_out = by_count(left_collocates)
    right_out = by_count(right_collocates)
    all_out = by_count(all_collocates)

    tuple_out = zip(all_out, left_out, right_out)
    # Same output as the Python 2 "print >> sys.stderr" form, but valid on
    # Python 3 as well.
    sys.stderr.write("COUNT %s\n" % count)
    return tuple_out
Example #37
0
                0] is "<" else area_of_expertise
            # Getting active department frequencies
            if department not in department_frequency.keys():
                department_frequency[department] = 1
            else:
                department_frequency[department] += 1

            # Adding areas of expertise frequencies
            if area_of_expertise not in areas_of_expertise_frequency.keys():
                areas_of_expertise_frequency[area_of_expertise] = 1
            else:
                areas_of_expertise_frequency[area_of_expertise] += 1

            # Read contents of the job page

            page_text = get_text(html_link)
            terms_in_link = NgramBuilder.ngramExtractor(
                NgramBuilder, page_text)
            no_of_keywords_in_link = len(terms_in_link)
            list_of_terms = list(terms_in_link)
            for j in range(0, no_of_keywords_in_link):
                if (list_of_terms[j][0] not in terms.keys()):
                    terms[list_of_terms[j][0]] = 1
                else:
                    terms[list_of_terms[j][0]] += 1

required_terms = [
    "information", "sharepoint", "drupal", "wordpress", "automation", "junior",
    "senior", "information technology", "ict", "agile", "scrum", "trello",
    "jira", "java", "python", "c", "golang", "cobol", "manager", "developer",
    "baseline", "security", "citizen", "citizenship", "police verification",
# Loops until the user decides to quit
while True:
    try:
        if first_time:
            action = '0'
            first_time = False
        else:
            # Gives the user a menu of actions to choose from
            action = functions.get_action()
            # Reloads the functions file if it's been updated
            mod_time = functions.update_functions(functions, mod_time)
        # Executes the appropriate feature depending on the action choice
        if action == '0':
            # Lets the user choose a new text file
            text, filename = functions.get_text()
        elif action == '1':
            # Counts how many times a word or phrase appears
            word = functions.get_word("Please enter a word or phrase to "
                                      "find in the text: ")
            functions.print_frequency(text, word)
        elif action == '2':
            # Prints all instances of a word or phrase
            word = functions.get_word("Please enter a word or phrase to "
                                      "find in the text: ")
            functions.print_instances(text, word)
        elif action == '3':
            # Replaces a word or phrase with another word or phrase
            old_word = functions.get_word("Please enter a word or phrase to "
                                          "find in the text: ")
            new_word = functions.get_word("Please enter a word or phrase to "
Example #39
0
from functions import get_text, preprocess_text

# Read the raw skills corpus and preprocess it; the second argument is passed
# through to preprocess_text unchanged.
text = get_text('skills100k.txt')
clear_text = preprocess_text(text, True)

# The with-statement closes the file on exit, so no explicit close() is needed.
with open("lemmatized_skills100k.txt", "w") as outfile:
    outfile.write(clear_text)
def fetch_colloc_concordance(results,
                             path,
                             q,
                             db,
                             config,
                             word_filter=True,
                             filter_num=100,
                             stopwords=True):
    """Return the hits whose surrounding words contain the chosen collocate.

    Each hit's left and right context is cleaned of entities and tags and
    tokenized; ``q['direction']`` picks 'left', 'right' or (any other
    value) both sides.  Hits where the NFC-normalised ``q['collocate']``
    appears among the tokens are kept, with the occurrence count stored on
    ``hit.collocate_num``.  The scan ends once more than
    ``q['start'] + q['results_per_page']`` hits have been kept.

    ``q['q']`` is always in the filter list.  With ``word_filter`` the list
    also gets the first token of each line of ``<path>/data/stopwords.txt``
    (read in full when ``stopwords`` is true and the file exists) or of
    ``<path>/data/frequencies/word_frequencies`` (reading stops once the
    line count exceeds ``filter_num``).

    Returns ``collocation_hitlist(kept_hits, q['collocate_num'])``.
    """
    length = config['concordance_length']
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = unicodedata.normalize('NFC',
                                      q['collocate'].decode('utf-8', 'ignore'))
    collocate_num = q['collocate_num']

    ## set up filtering with stopwords or 100 most frequent terms ##
    filter_list = set([q['q']])
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        if stopwords:
            stopwords_path = path + '/data/stopwords.txt'
            if os.path.isfile(stopwords_path):
                # The stopword file is read completely, so remove the cap.
                filter_list_path = stopwords_path
                filter_num = float("inf")
        # Opened in a with-block so the handle is closed (it used to leak).
        with open(filter_list_path) as filter_words_file:
            for line_count, line in enumerate(filter_words_file, 1):
                fields = line.split()
                if not fields:
                    # Blank lines used to raise IndexError; skip them.
                    continue
                filter_list.add(fields[0].decode('utf-8', 'ignore'))
                if line_count > filter_num:
                    break

    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        hit_bytes, byte_start = adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)

        ## Isolate left and right concordances
        conc_left = convert_entities(conc_text[:hit_bytes[0]].decode(
            'utf-8', 'ignore'))
        conc_left = begin_match.sub('', conc_left)
        conc_left = start_cutoff_match.sub('', conc_left)
        conc_right = convert_entities(conc_text[hit_bytes[-1]:].decode(
            'utf-8', 'ignore'))
        conc_right = end_match.sub('', conc_right)
        conc_right = left_truncate.sub('', conc_right)
        conc_left = strip_tags(conc_left)
        conc_right = strip_tags(conc_right)

        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction,
                             db)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words,
                             direction, db)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left',
                             db)
            words.extend(
                tokenize(conc_right, filter_list, within_x_words, 'right', db))

        # A single count() call covers both the membership test and the
        # tally, so the temporary set is not needed.
        count = words.count(collocate)
        if count:
            hit.collocate_num = count
            new_hitlist.append(hit)

        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break

    return collocation_hitlist(new_hitlist, collocate_num)