import re
import zipfile

def get_queries(source, num_queries=3):
    scored_chunks = []
    zip_data = zipfile.ZipFile(source).read('word/document.xml')
    try:
        # Word docs seem to always be encoded as UTF-8.
        # TODO Should really scan the encoding attribute, but for now just use this method
        xml = zip_data.decode('UTF-8')
    except UnicodeDecodeError:
        xml = zip_data.decode('ISO-8859-1')

    # Clean up the data, e.g. by converting significant XML elements such as line breaks into real line breaks
    text = xml.replace('<w:br/>', " \r\n")
    text = text.replace('</w:r></w:p></w:tc><w:tc>', " ")
    text = text.replace('</w:r><w:proofErr w:type="gramEnd"/></w:p>', " \r\n")
    text = text.replace('</w:r></w:p>', " \r\n")
    text = re.sub(r'<w:hyperlink.*?<w:t>(.*?)</w:t>.*?</w:hyperlink>', r' \1 ',
                  text)  # extract hyperlink text
    text = re.sub(r'<w:instrText.*?</w:instrText>', '',
                  text)  # remove 'instruction text' fields
    text = re.sub(r'HYPERLINK ".*?"', '', text)  # remove leftover HYPERLINK field codes
    text = strip_tags(text)

    for chunk in split_into_chunks(text, filter_poor_quality=True):
        score = calculate_unique_score_for_chunk(chunk)
        scored_chunks.append((remove_special_characters(chunk), score))

    return build_query_result(scored_chunks, num_queries, source=text)
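The helpers shared by these examples (strip_tags, split_into_chunks, calculate_unique_score_for_chunk, remove_special_characters, build_query_result) are defined elsewhere in the project. As a rough, assumption-labelled sketch only, strip_tags might reduce to something like this:

import re

def strip_tags_sketch(markup):
    # Hypothetical stand-in for the project's strip_tags helper: drop anything
    # that looks like an XML/HTML tag, then collapse runs of spaces and tabs.
    text = re.sub(r'<[^>]+>', ' ', markup)
    return re.sub(r'[ \t]+', ' ', text).strip()
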
def get_queries(url, num_queries=3):
    scored_chunks = []
    initial = get_htmlparsed_snippets(url)

    # Return now if there's been a failure (e.g. an HTTP 404 response)
    if initial['success'] is False:
        return initial

    for i in initial['data']:
        # Clean up the string by stripping out whitespace, special characters, etc.
        text, score = remove_special_characters(i[0]), i[1]

        # Boost the score according to the overall number of words in this snippet
        text_len = len(text.split())
        if 5 <= text_len < 8:
            score += 3
        elif 8 <= text_len < 20:
            score += 4
        elif 20 <= text_len < 100:
            score += 5
        elif text_len >= 100:
            score += 6

        for chunk in split_into_chunks(text):
            # Add 2 to the score, since the checks on the surrounding HTML elements raise scores relative to other content types
            chunk_score = score + calculate_unique_score_for_chunk(chunk) + 2
            scored_chunks.append((chunk, chunk_score))

    return build_query_result(scored_chunks, num_queries, initial['source'])
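A hedged usage sketch for the URL variant: get_htmlparsed_snippets returns a dict, so callers should check the 'success' flag before treating the result as scored queries. The URL below is only a placeholder, and the (chunk, score) tuple shape is assumed from the PDF variant's docstring.

result = get_queries('https://example.com/article', num_queries=3)
if isinstance(result, dict) and result.get('success') is False:
    print('Fetch failed:', result)        # e.g. an HTTP 404 reported by get_htmlparsed_snippets
else:
    for chunk, score in result:           # assumed (chunk, score) tuples from build_query_result
        print(score, chunk)
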
import re
import zipfile
import natsort  # third-party: natural (numeric-aware) sorting

def get_queries(source, num_queries=3):
    scored_chunks = []
    zip_file = zipfile.ZipFile(source)
    zip_files = zip_file.namelist()
    pattern = r'ppt/slides/slide\d+\.xml'  # each slide is stored as ppt/slides/slide<int>.xml
    all_slides = [slide for slide in zip_files if re.search(pattern, slide)]
    all_slides = natsort.natsorted(all_slides, key=lambda y: y.lower())

    source_text = ''
    for slide in all_slides:
        slide_data = zip_file.read(slide)

        try:
            # PowerPoint presentations seem to always be encoded as UTF-8.
            # TODO Should really scan the encoding attribute, but for now just use this method
            xml = slide_data.decode('UTF-8')
        except UnicodeDecodeError:
            xml = slide_data.decode('ISO-8859-1')

        # Clean up the slide XML: join adjacent text runs with spaces, then strip
        # attribute-name lists and slide-number fields before removing the remaining tags.
        text = xml.replace('</a:t></a:r>', ' ')
        text = re.sub(r'<p:attrNameLst>.*?</p:attrNameLst>', '', text)
        text = re.sub(r'<a:fld id=".*?" type="slidenum">.*?</a:fld>', '', text)
        text = strip_tags(text)
        source_text += text

        for chunk in split_into_chunks(text, filter_poor_quality=True):
            score = calculate_unique_score_for_chunk(chunk)
            scored_chunks.append((remove_special_characters(chunk), score))

    return build_query_result(scored_chunks, num_queries, source=source_text)
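natsort is used above so that slide file names sort in numeric order rather than lexicographically, which keeps slide10 after slide2; a quick illustration:

import natsort

slides = ['ppt/slides/slide10.xml', 'ppt/slides/slide2.xml', 'ppt/slides/slide1.xml']
print(sorted(slides))             # lexicographic: slide1, slide10, slide2
print(natsort.natsorted(slides))  # natural order:  slide1, slide2, slide10
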
def get_queries(source, num_queries=3):
    scored_chunks = []

    for chunk in split_into_chunks(source, filter_poor_quality=True):
        score = calculate_unique_score_for_chunk(chunk)
        scored_chunks.append((remove_special_characters(chunk), score))

    return build_query_result(scored_chunks, num_queries, source=source)
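split_into_chunks is not shown in any of these examples; as an assumption-labelled sketch, it presumably splits the cleaned text on line breaks (the earlier variants insert \r\n at paragraph boundaries) and, when filter_poor_quality is set, drops fragments too short to make useful queries:

def split_into_chunks_sketch(text, filter_poor_quality=False):
    # Hypothetical approximation of the project's split_into_chunks helper.
    chunks = [line.strip() for line in text.splitlines() if line.strip()]
    if filter_poor_quality:
        # Guess: discard fragments too short to be worth querying.
        chunks = [c for c in chunks if len(c.split()) >= 5]
    return chunks
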
import os
import subprocess

from django.conf import settings  # assumes a Django-style settings module

def get_queries(filename, num_queries=3):
    scored_chunks = []
    absolute_file_path = os.path.join(settings.MEDIA_ROOT, filename)
    doc_to_text_output = subprocess.check_output(
        [settings.DOC_TO_TEXT, absolute_file_path])
    try:
        text = doc_to_text_output.decode('utf-8')
    except UnicodeDecodeError:
        text = doc_to_text_output.decode('ISO-8859-1')

    for chunk in split_into_chunks(text, filter_poor_quality=True):
        score = calculate_unique_score_for_chunk(chunk)
        scored_chunks.append((remove_special_characters(chunk), score))

    return build_query_result(scored_chunks, num_queries, source=text)
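The .doc and PDF variants shell out to external converters whose paths come from settings; a minimal sketch of the relevant settings, assuming command-line tools such as antiword and pdftotext (all paths are placeholders):

# settings.py (sketch; paths are placeholders)
MEDIA_ROOT = '/var/www/media'
DOC_TO_TEXT = '/usr/bin/antiword'   # any tool that prints the document's text to stdout
PDF_TO_TEXT = '/usr/bin/pdftotext'  # invoked as: pdftotext -layout <file> -
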
def get_queries(filename, num_queries=3):
    '''
    :param filename: The filename for the PDF
    :param num_queries: the number of items in the list to return
    :return: a list of tuples containing the chunk (query) and its score
    '''
    scored_chunks = []
    absolute_file_path = os.path.join(settings.MEDIA_ROOT, filename)
    pdf_to_text_output = subprocess.check_output([settings.PDF_TO_TEXT, "-layout", absolute_file_path, "-"])
    try:
        text = pdf_to_text_output.decode('utf-8')
    except UnicodeDecodeError:
        text = pdf_to_text_output.decode('ISO-8859-1')

    for chunk in split_into_chunks(remove_special_characters(text)):
        # Since PDF extraction can be messy and lead to bad results, we use a
        # non-standard scoring system (for now):
        #   +1 if there are 8+ words of length >= 3 characters
        #   +1 if 75% or more of the non-whitespace characters are alphabetic
        # This filters out clearly poor chunks before uniqueness scoring.
        num_words_len3 = 0
        chunk_words = chunk.split()
        for word in chunk_words:
            if len(word) >= 3:
                num_words_len3 += 1

        score = (1 if num_words_len3 >= 8 else 0)

        # Strip all whitespace (not just spaces) and guard against empty chunks
        # to avoid a ZeroDivisionError.
        word_no_whitespace = re.sub(r'\s+', '', chunk)
        word_only_alpha = re.sub(r'[^a-zA-Z]+', '', word_no_whitespace)
        if word_no_whitespace and len(word_only_alpha) / len(word_no_whitespace) > 0.75:
            score += 1

        scored_chunks.append((chunk, score))

    # If there are more than num_queries chunks with the full score of 2, use uniqueness scoring to rank them
    full_score_chunks = [chunk for chunk in scored_chunks if chunk[1] == 2]

    if len(full_score_chunks) > num_queries:
        scored_chunks = []

        for scored_chunk in full_score_chunks:
            unique_score = calculate_unique_score_for_chunk(scored_chunk[0])
            scored_chunks.append((scored_chunk[0], scored_chunk[1]+unique_score))

    return build_query_result(scored_chunks, num_queries, source=text)
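To make the PDF scoring heuristic concrete, a small worked check of the alphabetic-ratio rule (the sample strings are invented):

import re

def alpha_ratio(chunk):
    # Fraction of non-whitespace characters that are alphabetic, as used in the PDF scoring above.
    no_space = re.sub(r'\s+', '', chunk)
    if not no_space:
        return 0.0
    return len(re.sub(r'[^a-zA-Z]+', '', no_space)) / len(no_space)

print(alpha_ratio('Quarterly revenue grew across all regions this year'))  # 1.0 -> earns the +1 bonus
print(alpha_ratio('. . . 12 34 | 56 --- 78 . . .'))                        # 0.0 -> no bonus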