Code example #1
File: lambda_handler.py Project: wordnik/serapis
def handler(event, context):
    if "term" in event:  # API call
        if not qualify_term(event['term']):
            return {'error': 'Invalid search term'}
        message = {
            "word": event['term'],
            'hashslug': hashslug(event['term'])
        }
        if "sentence" in event:  # Detect
            s_clean, variants = clean_sentence(event['sentence'], event['term'])
            message['crawl_date'] = now()
            message['urls'] = [{
                "url": event.get('url'),
                "source": get_source_from_url(event.get('url')),
                "sentences": [{
                    "s": event['sentence'],
                    "s_clean": s_clean,
                }],
                "variants": list(variants)
            }]
            return tasks.detect(message)
        else:  # Search
            return tasks.search(message)

    elif "Records" in event:  # This comes from S3
        for record in event['Records']:
            bucket = record['s3']['bucket']['name']
            key = record['s3']['object']['key']
            key = key.replace("%3A", ":")  # That's my URLDecode.
            if key.count(":") == 2:
                return run_task(bucket, key)
            elif key.endswith(".wordlist"):
                return add_words(bucket, key)
            else:
                print "Don't know what to do with '{}'".format(key)
Code example #2
File: extract.py Project: harendranathvegi9/serapis
    def extract_sentences(self, page_text):
        """Finds all sentences that contain the term or a spelling variants.
        Sets self.sentences ans self.variants.

        Args:
            page_text: str
        Returns:
            str -- cleaned page text.
        """
        doc = []
        for paragraph in page_text.split('\n\n'):
            if is_english(paragraph):
                for sentence in paragraph_to_sentences(paragraph, self.term):
                    if qualify_sentence(sentence):
                        doc.append(sentence)
                        s_clean, variants = clean_sentence(sentence, self.term)
                        if variants and s_clean not in [
                                s['s_clean'] for s in self.sentences
                        ]:
                            self.variants.update(variants)
                            self.sentences.append({
                                's': sentence,
                                's_clean': s_clean
                            })
        return " ".join(doc)
Code example #3
File: test_features.py Project: wordnik/serapis
def test_wordnik_patterns_perc():
    import csv  # needed for csv.reader below; its import is outside this excerpt
    from serapis.features import match_wordnik_rules
    from serapis.preprocess import clean_sentence

    min_coverage = 0.2
    matches = 0.0
    with open("serapis/tests/data/frds_wordnik.csv") as f:
        test_cases = list(csv.reader(f))
    for term, sentence in test_cases:
        s_clean, _ = clean_sentence(sentence, term)
        matches += 1 if match_wordnik_rules(s_clean) else 0
    assert matches / len(test_cases) > min_coverage, "Only matched {:.2f}% of data set".format(
        100 * matches / len(test_cases)
    )
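
Assuming a standard pytest setup from the repository root (the test runner is not shown in the source, so this invocation is an assumption), the test can be run on its own:

# Hypothetical invocation; assumes pytest is installed.
import pytest
pytest.main(["serapis/tests/test_features.py::test_wordnik_patterns_perc", "-q"])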