def get_queries(source, num_queries=3):
    """Extract scored query chunks from a Word (.docx) document.

    :param source: path or file-like object for the .docx file
    :param num_queries: the number of items to request from build_query_result
    :return: the result of build_query_result over the scored chunks
    """
    # .docx is a zip archive; the main body lives in word/document.xml.
    # Use a context manager so the archive handle is always closed.
    with zipfile.ZipFile(source) as zf:
        zip_data = zf.read('word/document.xml')
    try:
        # Word docs seem to always be encoded as UTF-8.
        # TODO Should really scan the encoding attribute, but for now just use this method
        xml = zip_data.decode('UTF-8')
    except UnicodeDecodeError:
        xml = zip_data.decode('ISO-8859-1')
    # Clean up the data - e.g. by replacing 'key' XML like linebreaks into actual linebreaks
    text = xml.replace('<w:br/>', " \r\n")
    text = text.replace('</w:r></w:p></w:tc><w:tc>', " ")
    text = text.replace('</w:r><w:proofErr w:type="gramEnd"/></w:p>', " \r\n")
    text = text.replace('</w:r></w:p>', " \r\n")
    text = re.sub(r'<w:hyperlink.*?<w:t>(.*?)</w:t>.*?</w:hyperlink>', r' \1 ', text)  # extract hyperlink text
    text = re.sub(r'<w:instrText.*?</w:instrText>', '', text)  # remove 'instruction text' fields
    text = re.sub(r'HYPERLINK ".*?"', '', text)
    text = strip_tags(text)
    scored_chunks = []
    for chunk in split_into_chunks(text, filter_poor_quality=True):
        score = calculate_unique_score_for_chunk(chunk)
        scored_chunks.append((remove_special_characters(chunk), score))
    return build_query_result(scored_chunks, num_queries, source=text)
def get_queries(url, num_queries=3):
    """Build scored query chunks from the HTML-parsed snippets of *url*.

    :param url: the URL to fetch and parse
    :param num_queries: the number of items to request from build_query_result
    :return: the failure dict from get_htmlparsed_snippets on error (e.g. a
        HTTP 404 code), otherwise the result of build_query_result
    """
    parsed = get_htmlparsed_snippets(url)
    # Bail out early if the fetch/parse failed (e.g. a HTTP 404 code)
    if parsed['success'] is False:
        return parsed
    scored_chunks = []
    for item in parsed['data']:
        # Strip out whitespace, special chars etc. before scoring
        snippet, base_score = remove_special_characters(item[0]), item[1]
        # Longer snippets earn a word-count bonus on top of the base score
        word_count = len(snippet.split())
        if word_count >= 100:
            base_score += 6
        elif word_count >= 20:
            base_score += 5
        elif word_count >= 8:
            base_score += 4
        elif word_count >= 5:
            base_score += 3
        for chunk in split_into_chunks(snippet):
            # +2 because the surrounding-HTML-element checks inflate scores
            # relative to other content types
            chunk_score = base_score + calculate_unique_score_for_chunk(chunk) + 2
            scored_chunks.append((chunk, chunk_score))
    return build_query_result(scored_chunks, num_queries, parsed['source'])
def get_queries(source, num_queries=3):
    """Extract scored query chunks from a PowerPoint (.pptx) presentation.

    :param source: path or file-like object for the .pptx file
    :param num_queries: the number of items to request from build_query_result
    :return: the result of build_query_result over the scored chunks
    """
    scored_chunks = []
    source_text = ''
    # .pptx is a zip archive; each slide has the format ppt/slides/slide[int].xml
    # NOTE: the '.' before 'xml' is escaped so only a literal dot matches.
    pattern = r'ppt/slides/slide\d+\.xml'
    # Context manager ensures the archive handle is closed even on error.
    with zipfile.ZipFile(source) as zip_file:
        all_slides = [slide for slide in zip_file.namelist() if re.search(pattern, slide)]
        # Natural sort so slide10 sorts after slide9, not after slide1
        all_slides = natsort.natsorted(all_slides, key=lambda y: y.lower())
        for slide in all_slides:
            slide_data = zip_file.read(slide)
            try:
                # Powerpoint presentations seem to always be encoded as UTF-8.
                # Should really scan the encoding attribute, but for now just use this method
                xml = slide_data.decode('UTF-8')
            except UnicodeDecodeError:
                xml = slide_data.decode('ISO-8859-1')
            text = xml.replace('</a:t></a:r>', ' ')
            text = re.sub(r'<p:attrNameLst>.*?</p:attrNameLst>', '', text)
            text = re.sub(r'<a:fld id=".*?" type="slidenum">.*?</a:fld>', '', text)
            text = strip_tags(text)
            source_text += text
            for chunk in split_into_chunks(text, filter_poor_quality=True):
                score = calculate_unique_score_for_chunk(chunk)
                scored_chunks.append((remove_special_characters(chunk), score))
    return build_query_result(scored_chunks, num_queries, source=source_text)
def get_queries(source, num_queries=3):
    """Score the chunks of a plain-text *source* and build the query result.

    :param source: the raw text to split into chunks and score
    :param num_queries: the number of items to request from build_query_result
    :return: the result of build_query_result over the scored chunks
    """
    # Pair each cleaned chunk with its uniqueness score in a single pass.
    scored_chunks = [
        (remove_special_characters(chunk), calculate_unique_score_for_chunk(chunk))
        for chunk in split_into_chunks(source, filter_poor_quality=True)
    ]
    return build_query_result(scored_chunks, num_queries, source=source)
def get_queries(filename, num_queries=3):
    """Extract scored query chunks from a legacy Word (.doc) file.

    Shells out to the external DOC_TO_TEXT converter configured in settings.

    :param filename: filename relative to settings.MEDIA_ROOT
    :param num_queries: the number of items to request from build_query_result
    :return: the result of build_query_result over the scored chunks
    :raises subprocess.CalledProcessError: if the converter exits non-zero
    """
    absolute_file_path = os.path.join(settings.MEDIA_ROOT, filename)
    # List-form argv (shell=False) so the filename is never shell-interpreted
    doc_to_text_output = subprocess.check_output(
        [settings.DOC_TO_TEXT, absolute_file_path])
    try:
        text = doc_to_text_output.decode('utf-8')
    except UnicodeDecodeError:
        text = doc_to_text_output.decode('ISO-8859-1')
    scored_chunks = []
    for chunk in split_into_chunks(text, filter_poor_quality=True):
        score = calculate_unique_score_for_chunk(chunk)
        scored_chunks.append((remove_special_characters(chunk), score))
    return build_query_result(scored_chunks, num_queries, source=text)
def get_queries(filename, num_queries=3):
    """Extract scored query chunks from a PDF file.

    :param filename: The filename for the PDF, relative to settings.MEDIA_ROOT
    :param num_queries: the number of items in the list to return
    :return: a list of tuples containing the chunk (query) and its score
    :raises subprocess.CalledProcessError: if pdftotext exits non-zero
    """
    scored_chunks = []
    absolute_file_path = os.path.join(settings.MEDIA_ROOT, filename)
    pdf_to_text_output = subprocess.check_output(
        [settings.PDF_TO_TEXT, "-layout", absolute_file_path, "-"])
    try:
        text = pdf_to_text_output.decode('utf-8')
    except UnicodeDecodeError:
        text = pdf_to_text_output.decode('ISO-8859-1')
    for chunk in split_into_chunks(remove_special_characters(text)):
        # Since PDF extraction can be messy and lead to bad results, we use a
        # non standard scoring system (for now):
        #   +1 if there's 8+ words of length >= 3 characters
        #   +1 if 75% or more of the non-whitespace characters are alphabetic
        # This is to eliminate any clearly crap results
        num_words_len3 = sum(1 for word in chunk.split() if len(word) >= 3)
        score = 1 if num_words_len3 >= 8 else 0
        word_no_whitespace = chunk.replace(' ', '')
        word_only_alpha = re.sub(r'[^a-zA-Z]+', '', word_no_whitespace)
        # Guard against an all-whitespace chunk, which would otherwise raise
        # ZeroDivisionError on the len() division below.
        if word_no_whitespace and len(word_only_alpha) / len(word_no_whitespace) > 0.75:
            score += 1
        scored_chunks.append((chunk, score))
    # Okay, if there's > num_queries chunks with a score of 2, now we'll use uniqueness scoring
    full_score_chunks = [chunk for chunk in scored_chunks if chunk[1] == 2]
    if len(full_score_chunks) > num_queries:
        scored_chunks = []
        for scored_chunk in full_score_chunks:
            unique_score = calculate_unique_score_for_chunk(scored_chunk[0])
            scored_chunks.append((scored_chunk[0], scored_chunk[1] + unique_score))
    return build_query_result(scored_chunks, num_queries, source=text)