def extract_sentences(self, page_text):
    """Finds all sentences that contain the term or a spelling variant.

    Sets self.sentences and self.variants.

    Args:
        page_text: str

    Returns:
        str -- cleaned page text.
    """
    doc = []
    for paragraph in page_text.split('\n\n'):
        # Only consider English paragraphs.
        if is_english(paragraph):
            for sentence in paragraph_to_sentences(paragraph, self.term):
                if qualify_sentence(sentence):
                    doc.append(sentence)
                    s_clean, variants = clean_sentence(sentence, self.term)
                    # Record each sentence only once per cleaned form.
                    if variants and s_clean not in [s['s_clean'] for s in self.sentences]:
                        self.variants.update(variants)
                        self.sentences.append({
                            's': sentence,
                            's_clean': s_clean
                        })
    return " ".join(doc)
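# Hedged usage sketch: extract_sentences is an instance method, so this
# assumes some owning object (called `result` here, a hypothetical name)
# that carries `term` and accumulates `sentences` and `variants`:
#
#     cleaned = result.extract_sentences(page_text)
#     # result.sentences -> [{'s': ..., 's_clean': ...}, ...]
#     # result.variants  -> spelling variants collected so far
#     # cleaned          -> all qualifying sentences joined into one string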
def test_english_detection():
    import csv  # needed for csv.reader below
    from serapis.language import is_english

    with open("serapis/tests/data/language_detection.csv") as f:
        test_cases = list(csv.reader(f))
    for language, sentence in test_cases:
        sentence = sentence.decode('utf-8')  # Python 2: csv.reader yields byte strings
        detected_english = is_english(sentence)
        if language == 'english':
            assert detected_english, "Falsely classified '{}...' as non-English".format(sentence[:40])
        else:
            assert not detected_english, "Falsely classified '{}...' as English".format(sentence[:40])
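# The fixture file is assumed to hold one (language, sentence) pair per row,
# as implied by the tuple unpacking above. Illustrative rows (not actual
# fixture data) might look like:
#
#     english,"The quick brown fox jumps over the lazy dog."
#     french,"Le renard brun saute par-dessus le chien."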
def qualify_search_result(url, text, date=None):
    """Heuristically determines if a search result is worth parsing.

    Args:
        url: str
        text: str -- Preview or summary
        date: str -- ISO8601 formatted

    Returns:
        bool -- True if the search result is worth parsing.
    """
    # `config`, `log`, `is_english`, and `urlparse` are module-level names;
    # this assumes something like `import urlparse` (Python 2 stdlib) plus the
    # project's config and logging objects imported at the top of the module.
    for domain in config.exclude_domains:
        # Skip blacklisted domains.
        if domain in url:
            return False
    # Skip results whose preview text is not English.
    if text and not is_english(text):
        log.info("Excluded non-English result '{}'".format(text))
        return False
    # Skip PDFs; only HTML pages are parsed.
    parts = urlparse.urlparse(url)
    if parts.path.endswith(".pdf"):
        return False
    return True
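# Hedged usage sketch (URLs are illustrative, not from the project):
#
#     qualify_search_result("http://example.com/doc.pdf", "a preview")
#     # -> False, PDF paths are rejected
#     qualify_search_result("http://example.com/post", "An English preview")
#     # -> True, provided example.com is not in config.exclude_domains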