コード例 #1
0
ファイル: extract.py プロジェクト: harendranathvegi9/serapis
    def extract_sentences(self, page_text):
        """Collect qualifying sentences that contain the term or a spelling variant.

        Side effects: appends new entries to self.sentences and merges any
        discovered variants into self.variants.

        Args:
            page_text: str

        Returns:
            str -- cleaned page text (space-joined qualifying sentences).
        """
        kept = []
        # Only English paragraphs are considered; others are dropped outright.
        english_paragraphs = (p for p in page_text.split('\n\n') if is_english(p))
        for paragraph in english_paragraphs:
            for sentence in paragraph_to_sentences(paragraph, self.term):
                if not qualify_sentence(sentence):
                    continue
                kept.append(sentence)
                s_clean, variants = clean_sentence(sentence, self.term)
                # Skip duplicates already recorded (compared on cleaned form).
                already_seen = any(
                    entry['s_clean'] == s_clean for entry in self.sentences
                )
                if variants and not already_seen:
                    self.variants.update(variants)
                    self.sentences.append({'s': sentence, 's_clean': s_clean})
        return " ".join(kept)
コード例 #2
0
ファイル: test_search.py プロジェクト: clarecorthell/serapis
def test_english_detection():
    """Check is_english against every labelled case in the CSV fixture."""
    from serapis.language import is_english
    with open("serapis/tests/data/language_detection.csv") as f:
        rows = list(csv.reader(f))
    for language, raw_sentence in rows:
        sentence = raw_sentence.decode('utf-8')
        is_eng = is_english(sentence)
        if language == 'english':
            assert is_eng, "Falsely classified '{}...' as non-English".format(sentence[:40])
        else:
            assert not is_eng, "Falsely classified '{}...' as English".format(sentence[:40])
コード例 #3
0
def test_english_detection():
    """Verify language classification on the labelled fixture file."""
    from serapis.language import is_english
    with open("serapis/tests/data/language_detection.csv") as f:
        test_cases = [row for row in csv.reader(f)]
    for expected_lang, sentence in test_cases:
        text = sentence.decode('utf-8')
        result = is_english(text)
        if expected_lang == 'english':
            msg = "Falsely classified '{}...' as non-English".format(text[:40])
            assert result, msg
        else:
            msg = "Falsely classified '{}...' as English".format(text[:40])
            assert not result, msg
コード例 #4
0
ファイル: search.py プロジェクト: wordnik/serapis
def qualify_search_result(url, text, date=None):
    """Heuristically determines whether a search result is worth parsing.

    Args:
        url: str
        text: str -- preview or summary of the result
        date: str -- ISO8601 formatted (currently unused by the heuristics)

    Returns:
        bool -- True if the search result is worth parsing.
    """
    # Reject anything hosted on an explicitly excluded domain.
    if any(domain in url for domain in config.exclude_domains):
        return False
    # Reject results whose preview text is detectably non-English.
    if text and not is_english(text):
        log.info("Excluded non-english result '{}'".format(text))
        return False
    # PDFs are not parseable by the downstream pipeline.
    return not urlparse.urlparse(url).path.endswith(".pdf")
コード例 #5
0
ファイル: search.py プロジェクト: harendranathvegi9/serapis
def qualify_search_result(url, text, date=None):
    """Heuristic filter deciding if a search result deserves parsing.

    Args:
        url: str
        text: str -- preview or summary of the result
        date: str -- ISO8601 formatted (not consulted by the current checks)

    Returns:
        bool -- True if the search result is worth parsing.
    """
    # Drop results from any blocklisted domain.
    for blocked in config.exclude_domains:
        if blocked in url:
            return False
    # Drop results whose snippet is present but not English.
    if text:
        if not is_english(text):
            log.info("Excluded non-english result '{}'".format(text))
            return False
    # Drop direct links to PDF documents.
    parsed = urlparse.urlparse(url)
    if parsed.path.endswith(".pdf"):
        return False
    return True
コード例 #6
0
ファイル: extract.py プロジェクト: wordnik/serapis
    def extract_sentences(self, page_text):
        """Find every sentence that contains the term or a spelling variant.

        Side effects: extends self.sentences with new cleaned sentences and
        updates self.variants with any variants found.

        Args:
            page_text: str

        Returns:
            str -- cleaned page text (qualifying sentences joined by spaces).
        """
        collected = []
        for block in page_text.split('\n\n'):
            # Skip non-English paragraphs entirely.
            if not is_english(block):
                continue
            for sent in paragraph_to_sentences(block, self.term):
                if not qualify_sentence(sent):
                    continue
                collected.append(sent)
                cleaned, found_variants = clean_sentence(sent, self.term)
                # Deduplicate against sentences recorded so far.
                known = [entry['s_clean'] for entry in self.sentences]
                if found_variants and cleaned not in known:
                    self.variants.update(found_variants)
                    self.sentences.append({'s': sent, 's_clean': cleaned})
        return " ".join(collected)